From 04a42e72d77a93a166b79c34b7bc862f55a53967 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Dec 2022 22:17:57 -0800 Subject: mm: move folio_set_compound_order() to mm/internal.h folio_set_compound_order() is moved to an mm-internal location so external folio users cannot misuse this function. Change the name of the function to folio_set_order() and use WARN_ON_ONCE() rather than BUG_ON. Also, handle the case if a non-large folio is passed and add clarifying comments to the function. Link: https://lore.kernel.org/lkml/20221207223731.32784-1-sidhartha.kumar@oracle.com/T/ Link: https://lkml.kernel.org/r/20221215061757.223440-1-sidhartha.kumar@oracle.com Fixes: 9fd330582b2f ("mm: add folio dtor and order setter functions") Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Suggested-by: Matthew Wilcox Suggested-by: John Hubbard Reviewed-by: John Hubbard Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ---------------- mm/hugetlb.c | 6 +++--- mm/internal.h | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..253b2d7489e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1019,22 +1019,6 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } -/* - * folio_set_compound_order is generally passed a non-zero order to - * initialize a large folio. However, hugetlb code abuses this by - * passing in zero when 'dissolving' a large folio. - */ -static inline void folio_set_compound_order(struct folio *folio, - unsigned int order) -{ - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - - folio->_folio_order = order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} - /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bdbfeb6fb393..cfd47a66ded0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1492,7 +1492,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, set_page_refcounted(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); } @@ -1956,7 +1956,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, __folio_clear_reserved(folio); __folio_set_head(folio); /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_compound_order(folio, order); + folio_set_order(folio, order); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); @@ -2020,7 +2020,7 @@ out_error: p = folio_page(folio, j); __ClearPageReserved(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); return false; } diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..1d6f4e168510 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -378,6 +378,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. + */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + /* + * When hugetlb dissolves a folio, we need to clear the tail + * page, rather than setting nr_pages to 1. + */ + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* -- cgit v1.2.3 From f1eb1bacfba9019823b2fce42383f010cd561fa6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 14 Dec 2022 15:15:33 -0500 Subject: mm/uffd: always wr-protect pte in pte|pmd_mkuffd_wp() This patch is a cleanup to always wr-protect pte/pmd in mkuffd_wp paths. The reasons I still think this patch is worthwhile, are: (1) It is a cleanup already; diffstat tells. (2) It just feels natural after I thought about this, if the pte is uffd protected, let's remove the write bit no matter what it was. (2) Since x86 is the only arch that supports uffd-wp, it also redefines pte|pmd_mkuffd_wp() in that it should always contain removals of write bits. It means any future arch that want to implement uffd-wp should naturally follow this rule too. It's good to make it a default, even if with vm_page_prot changes on VM_UFFD_WP. (3) It covers more than vm_page_prot. So no chance of any potential future "accident" (like pte_mkdirty() sparc64 or loongarch, even though it just got its pte_mkdirty fixed <1 month ago). It'll be fairly clear when reading the code too that we don't worry anything before a pte_mkuffd_wp() on uncertainty of the write bit. We may call pte_wrprotect() one more time in some paths (e.g. thp split), but that should be fully local bitop instruction so the overhead should be negligible. Although this patch should logically also fix all the known issues on uffd-wp too recently on page migration (not for numa hint recovery - that may need another explcit pte_wrprotect), but this is not the plan for that fix. So no fixes, and stable doesn't need this. Link: https://lkml.kernel.org/r/20221214201533.1774616-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ives van Hoorne Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 24 ++++++++++++------------ include/asm-generic/hugetlb.h | 16 ++++++++-------- mm/huge_memory.c | 8 +++----- mm/hugetlb.c | 4 ++-- mm/memory.c | 8 +++----- mm/mprotect.c | 6 ++---- mm/userfaultfd.c | 18 ++---------------- 7 files changed, 32 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0564edd24ffb..1c843395a8b3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -289,6 +289,11 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { @@ -313,7 +318,7 @@ static inline int pte_uffd_wp(pte_t pte) static inline pte_t pte_mkuffd_wp(pte_t pte) { - return pte_set_flags(pte, _PAGE_UFFD_WP); + return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) @@ -332,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte) return pte_clear_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_RW); -} - static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); @@ -401,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { @@ -409,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd) static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_UFFD_WP); + return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) @@ -428,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_DIRTY); } -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - static inline pmd_t pmd_mkdirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index a57d667addd2..d7f6335d3999 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte) return pte_mkwrite(pte); } +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); @@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { - return pte_mkuffd_wp(pte); + return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) @@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte) return huge_pte_none(pte) || is_pte_marker(pte); } -#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} -#endif - #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..867f02e6061d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1920,17 +1920,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (uffd_wp) { - entry = pmd_wrprotect(entry); + if (uffd_wp) entry = pmd_mkuffd_wp(entry); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled. */ entry = pmd_clear_uffd_wp(entry); - } /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && @@ -3275,7 +3273,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) - pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cfd47a66ded0..92b3fd01a652 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5919,7 +5919,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * if populated. */ if (unlikely(pte_marker_uffd_wp(old_pte))) - new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); + new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); @@ -6728,7 +6728,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) - pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); diff --git a/mm/memory.c b/mm/memory.c index 3e836fecd035..ca490596b36f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -878,7 +878,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ - pte = pte_wrprotect(pte_mkuffd_wp(pte)); + pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -3950,10 +3950,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - if (pte_swp_uffd_wp(vmf->orig_pte)) { + if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); - pte = pte_wrprotect(pte); - } vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -4296,7 +4294,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(uffd_wp)) - entry = pte_mkuffd_wp(pte_wrprotect(entry)); + entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter(vma->vm_mm, MM_ANONPAGES); diff --git a/mm/mprotect.c b/mm/mprotect.c index 61cf60015a8b..bf8fa0af5a15 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (uffd_wp) { - ptent = pte_wrprotect(ptent); + if (uffd_wp) ptent = pte_mkuffd_wp(ptent); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) ptent = pte_clear_uffd_wp(ptent); - } /* * In some writable, shared mappings, we might want diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0499907b6f1a..f8d31b82aceb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - - /* - * Always mark a PTE as write-protected when needed, regardless of - * VM_WRITE, which the user might change. - */ - if (wp_copy) { - _dst_pte = pte_mkuffd_wp(_dst_pte); - writable = false; - } - if (writable) _dst_pte = pte_mkwrite(_dst_pte); - else - /* - * We need this to make sure write bit removed; as mk_pte() - * could return a pte with write bit set. - */ - _dst_pte = pte_wrprotect(_dst_pte); + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); -- cgit v1.2.3 From 6fd7353829cafc4067aad9eea0dc95da67e7df16 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Thu, 15 Dec 2022 00:12:01 +0000 Subject: mm/memfd: add F_SEAL_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/memfd: introduce MFD_NOEXEC_SEAL and MFD_EXEC", v8. Since Linux introduced the memfd feature, memfd have always had their execute bit set, and the memfd_create() syscall doesn't allow setting it differently. However, in a secure by default system, such as ChromeOS, (where all executables should come from the rootfs, which is protected by Verified boot), this executable nature of memfd opens a door for NoExec bypass and enables “confused deputy attack”. E.g, in VRP bug [1]: cros_vm process created a memfd to share the content with an external process, however the memfd is overwritten and used for executing arbitrary code and root escalation. [2] lists more VRP in this kind. On the other hand, executable memfd has its legit use, runc uses memfd’s seal and executable feature to copy the contents of the binary then execute them, for such system, we need a solution to differentiate runc's use of executable memfds and an attacker's [3]. To address those above, this set of patches add following: 1> Let memfd_create() set X bit at creation time. 2> Let memfd to be sealed for modifying X bit. 3> A new pid namespace sysctl: vm.memfd_noexec to control the behavior of X bit.For example, if a container has vm.memfd_noexec=2, then memfd_create() without MFD_NOEXEC_SEAL will be rejected. 4> A new security hook in memfd_create(). This make it possible to a new LSM, which rejects or allows executable memfd based on its security policy. This patch (of 5): The new F_SEAL_EXEC flag will prevent modification of the exec bits: written as traditional octal mask, 0111, or as named flags, S_IXUSR | S_IXGRP | S_IXOTH. Any chmod(2) or similar call that attempts to modify any of these bits after the seal is applied will fail with errno EPERM. This will preserve the execute bits as they are at the time of sealing, so the memfd will become either permanently executable or permanently un-executable. Link: https://lkml.kernel.org/r/20221215001205.51969-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20221215001205.51969-2-jeffxu@google.com Signed-off-by: Daniel Verkamp Co-developed-by: Jeff Xu Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Shuah Khan Cc: David Herrmann Cc: kernel test robot Signed-off-by: Andrew Morton --- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 2 ++ mm/shmem.c | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 2f86b2ad6d7e..e8c07da58c9f 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,7 @@ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 08f5f8304746..4ebeab94aa74 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -147,6 +147,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) } #define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ @@ -175,6 +176,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file + * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file diff --git a/mm/shmem.c b/mm/shmem.c index 0005ab2c29af..d3f0c94f6836 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1093,6 +1093,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; + if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { + if ((inode->i_mode ^ attr->ia_mode) & 0111) { + return -EPERM; + } + } + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; -- cgit v1.2.3 From 32d118ad50a5afecb74358bcefc5cb6ea6ccfc2b Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Thu, 15 Dec 2022 00:12:02 +0000 Subject: selftests/memfd: add tests for F_SEAL_EXEC Basic tests to ensure that user/group/other execute bits cannot be changed after applying F_SEAL_EXEC to a memfd. Link: https://lkml.kernel.org/r/20221215001205.51969-3-jeffxu@google.com Signed-off-by: Daniel Verkamp Co-developed-by: Jeff Xu Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 123 ++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 94df2692e6e4..f18a15a1f275 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -28,12 +28,38 @@ #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 +#define F_SEAL_EXEC 0x0020 + /* * Default is not to test hugetlbfs */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; +static ssize_t fd2name(int fd, char *buf, size_t bufsize) +{ + char buf1[PATH_MAX]; + int size; + ssize_t nbytes; + + size = snprintf(buf1, PATH_MAX, "/proc/self/fd/%d", fd); + if (size < 0) { + printf("snprintf(%d) failed on %m\n", fd); + abort(); + } + + /* + * reserver one byte for string termination. + */ + nbytes = readlink(buf1, buf, bufsize-1); + if (nbytes == -1) { + printf("readlink(%s) failed %m\n", buf1); + abort(); + } + buf[nbytes] = '\0'; + return nbytes; +} + static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { int r, fd; @@ -98,11 +124,14 @@ static unsigned int mfd_assert_get_seals(int fd) static void mfd_assert_has_seals(int fd, unsigned int seals) { + char buf[PATH_MAX]; + int nbytes; unsigned int s; + fd2name(fd, buf, PATH_MAX); s = mfd_assert_get_seals(fd); if (s != seals) { - printf("%u != %u = GET_SEALS(%d)\n", seals, s, fd); + printf("%u != %u = GET_SEALS(%s)\n", seals, s, buf); abort(); } } @@ -594,6 +623,64 @@ static void mfd_fail_grow_write(int fd) } } +static void mfd_assert_mode(int fd, int mode) +{ + struct stat st; + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fstat(fd, &st) < 0) { + printf("fstat(%s) failed: %m\n", buf); + abort(); + } + + if ((st.st_mode & 07777) != mode) { + printf("fstat(%s) wrong file mode 0%04o, but expected 0%04o\n", + buf, (int)st.st_mode & 07777, mode); + abort(); + } +} + +static void mfd_assert_chmod(int fd, int mode) +{ + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fchmod(fd, mode) < 0) { + printf("fchmod(%s, 0%04o) failed: %m\n", buf, mode); + abort(); + } + + mfd_assert_mode(fd, mode); +} + +static void mfd_fail_chmod(int fd, int mode) +{ + struct stat st; + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fstat(fd, &st) < 0) { + printf("fstat(%s) failed: %m\n", buf); + abort(); + } + + if (fchmod(fd, mode) == 0) { + printf("fchmod(%s, 0%04o) didn't fail as expected\n", + buf, mode); + abort(); + } + + /* verify that file mode bits did not change */ + mfd_assert_mode(fd, st.st_mode & 07777); +} + static int idle_thread_fn(void *arg) { sigset_t set; @@ -880,6 +967,39 @@ static void test_seal_resize(void) close(fd); } +/* + * Test SEAL_EXEC + * Test that chmod() cannot change x bits after sealing + */ +static void test_seal_exec(void) +{ + int fd; + + printf("%s SEAL-EXEC\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + + mfd_assert_chmod(fd, 0644); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + + mfd_assert_chmod(fd, 0600); + mfd_fail_chmod(fd, 0777); + mfd_fail_chmod(fd, 0670); + mfd_fail_chmod(fd, 0605); + mfd_fail_chmod(fd, 0700); + mfd_fail_chmod(fd, 0100); + mfd_assert_chmod(fd, 0666); + + close(fd); +} + /* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. @@ -1059,6 +1179,7 @@ int main(int argc, char **argv) test_seal_shrink(); test_seal_grow(); test_seal_resize(); + test_seal_exec(); test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); -- cgit v1.2.3 From 105ff5339f498af74e60d7662c8f1c4d21f1342d Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:03 +0000 Subject: mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC The new MFD_NOEXEC_SEAL and MFD_EXEC flags allows application to set executable bit at creation time (memfd_create). When MFD_NOEXEC_SEAL is set, memfd is created without executable bit (mode:0666), and sealed with F_SEAL_EXEC, so it can't be chmod to be executable (mode: 0777) after creation. when MFD_EXEC flag is set, memfd is created with executable bit (mode:0777), this is the same as the old behavior of memfd_create. The new pid namespaced sysctl vm.memfd_noexec has 3 values: 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_EXEC was set. 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_NOEXEC_SEAL was set. 2: memfd_create() without MFD_NOEXEC_SEAL will be rejected. The sysctl allows finer control of memfd_create for old-software that doesn't set the executable bit, for example, a container with vm.memfd_noexec=1 means the old-software will create non-executable memfd by default. Also, the value of memfd_noexec is passed to child namespace at creation time. For example, if the init namespace has vm.memfd_noexec=2, all its children namespaces will be created with 2. [akpm@linux-foundation.org: add stub functions to fix build] [akpm@linux-foundation.org: remove unneeded register_pid_ns_ctl_table_vm() stub, per Jeff] [akpm@linux-foundation.org: s/pr_warn_ratelimited/pr_warn_once/, per review] [akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warning] Link: https://lkml.kernel.org/r/20221215001205.51969-4-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 19 ++++++++++++++ include/uapi/linux/memfd.h | 4 +++ kernel/pid_namespace.c | 5 ++++ kernel/pid_sysctl.h | 60 +++++++++++++++++++++++++++++++++++++++++++ mm/memfd.c | 48 ++++++++++++++++++++++++++++++++-- 5 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 kernel/pid_sysctl.h diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 07481bb87d4e..c758809d5bcf 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,21 @@ struct fs_pin; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +/* + * sysctl for vm.memfd_noexec + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_EXEC was set. + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_NOEXEC_SEAL was set. + * 2: memfd_create() without MFD_NOEXEC_SEAL will be + * rejected. + */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +#endif + struct pid_namespace { struct idr idr; struct rcu_head rcu; @@ -31,6 +46,10 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + /* sysctl for vm.memfd_noexec */ + int memfd_noexec_scope; +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..273a4e15dfcf 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,10 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b4..8a98b1af9376 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; @@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + initialize_memfd_noexec_scope(ns); + return ns; out_free_idr: @@ -455,6 +458,8 @@ static __init int pid_namespaces_init(void) #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif + + register_pid_ns_sysctl_table_vm(); return 0; } diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h new file mode 100644 index 000000000000..e22d072e1e24 --- /dev/null +++ b/kernel/pid_sysctl.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_PID_SYSCTL_H +#define LINUX_PID_SYSCTL_H + +#include + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) +{ + ns->memfd_noexec_scope = + task_active_pid_ns(current)->memfd_noexec_scope; +} + +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, + int write, void *buf, size_t *lenp, loff_t *ppos) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct ctl_table table_copy; + + if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + table_copy = *table; + if (ns != &init_pid_ns) + table_copy.data = &ns->memfd_noexec_scope; + + /* + * set minimum to current value, the effect is only bigger + * value is accepted. + */ + if (*(int *)table_copy.data > *(int *)table_copy.extra1) + table_copy.extra1 = table_copy.data; + + return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table_vm[] = { + { + .procname = "memfd_noexec", + .data = &init_pid_ns.memfd_noexec_scope, + .maxlen = sizeof(init_pid_ns.memfd_noexec_scope), + .mode = 0644, + .proc_handler = pid_mfd_noexec_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; +static inline void register_pid_ns_sysctl_table_vm(void) +{ + register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); +} +#else +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void register_pid_ns_sysctl_table_vm(void) {} +#endif + +#endif /* LINUX_PID_SYSCTL_H */ diff --git a/mm/memfd.c b/mm/memfd.c index 4ebeab94aa74..bc214390e28d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -263,12 +264,13 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { + char comm[TASK_COMM_LEN]; unsigned int *file_seals; struct file *file; int fd, error; @@ -285,6 +287,40 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; } + /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ + if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) + return -EINVAL; + + if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { +#ifdef CONFIG_SYSCTL + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + if (ns) + sysctl = ns->memfd_noexec_scope; + + switch (sysctl) { + case MEMFD_NOEXEC_SCOPE_EXEC: + flags |= MFD_EXEC; + break; + case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL: + flags |= MFD_NOEXEC_SEAL; + break; + default: + pr_warn_once( + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + return -EINVAL; + } +#else + flags |= MFD_EXEC; +#endif + pr_warn_once( + "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + } + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -328,7 +364,15 @@ SYSCALL_DEFINE2(memfd_create, file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) { + if (flags & MFD_NOEXEC_SEAL) { + struct inode *inode = file_inode(file); + + inode->i_mode &= ~0111; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } else if (flags & MFD_ALLOW_SEALING) { + /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); *file_seals &= ~F_SEAL_SEAL; } -- cgit v1.2.3 From c4f75bc8bd6b3d62665e1f5400c419540edb5601 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:04 +0000 Subject: mm/memfd: add write seals when apply SEAL_EXEC to executable memfd In order to avoid WX mappings, add F_SEAL_WRITE when apply F_SEAL_EXEC to an executable memfd, so W^X from start. This implys application need to fill the content of the memfd first, after F_SEAL_EXEC is applied, application can no longer modify the content of the memfd. Typically, application seals the memfd right after writing to it. For example: 1. memfd_create(MFD_EXEC). 2. write() code to the memfd. 3. fcntl(F_ADD_SEALS, F_SEAL_EXEC) to convert the memfd to W^X. 4. call exec() on the memfd. Link: https://lkml.kernel.org/r/20221215001205.51969-5-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: Daniel Verkamp Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/memfd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memfd.c b/mm/memfd.c index bc214390e28d..a0a7a37e8177 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -222,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } + /* + * SEAL_EXEC implys SEAL_WRITE, making W^X from the start. + */ + if (seals & F_SEAL_EXEC && inode->i_mode & 0111) + seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; + *file_seals |= seals; error = 0; -- cgit v1.2.3 From 11f75a01448f1b7a739e75dbd8f17b844fcfc510 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:05 +0000 Subject: selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC Tests to verify MFD_NOEXEC, MFD_EXEC and vm.memfd_noexec sysctl. Link: https://lkml.kernel.org/r/20221215001205.51969-6-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/fuse_test.c | 1 + tools/testing/selftests/memfd/memfd_test.c | 228 ++++++++++++++++++++++++++++- 2 files changed, 224 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index be675002f918..93798c8c5d54 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index f18a15a1f275..ae71f15f790d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -30,6 +30,14 @@ #define F_SEAL_EXEC 0x0020 +#define F_WX_SEALS (F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + +#define MFD_NOEXEC_SEAL 0x0008U + /* * Default is not to test hugetlbfs */ @@ -80,6 +88,37 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) return fd; } +static void sysctl_assert_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) < 0) { + printf("write sysctl failed\n"); + abort(); + } +} + +static void sysctl_fail_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) >= 0) { + printf("write sysctl %s succeeded, but failure expected\n", + val); + abort(); + } +} + static int mfd_assert_reopen_fd(int fd_in) { int fd; @@ -758,6 +797,9 @@ static void test_create(void) mfd_fail_new("", ~0); mfd_fail_new("", 0x80000000U); + /* verify EXEC and NOEXEC_SEAL can't both be set */ + mfd_fail_new("", MFD_EXEC | MFD_NOEXEC_SEAL); + /* verify MFD_CLOEXEC is allowed */ fd = mfd_assert_new("", 0, MFD_CLOEXEC); close(fd); @@ -969,20 +1011,21 @@ static void test_seal_resize(void) /* * Test SEAL_EXEC - * Test that chmod() cannot change x bits after sealing + * Test fd is created with exec and allow sealing. + * chmod() cannot change x bits after sealing. */ -static void test_seal_exec(void) +static void test_exec_seal(void) { int fd; printf("%s SEAL-EXEC\n", memfd_str); + printf("%s Apply SEAL_EXEC\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_exec", mfd_def_size, - MFD_CLOEXEC | MFD_ALLOW_SEALING); + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); mfd_assert_mode(fd, 0777); - mfd_assert_chmod(fd, 0644); mfd_assert_has_seals(fd, 0); @@ -996,10 +1039,181 @@ static void test_seal_exec(void) mfd_fail_chmod(fd, 0700); mfd_fail_chmod(fd, 0100); mfd_assert_chmod(fd, 0666); + mfd_assert_write(fd); + close(fd); + + printf("%s Apply ALL_SEALS\n", memfd_str); + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); + + mfd_assert_mode(fd, 0777); + mfd_assert_chmod(fd, 0700); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_WX_SEALS); + mfd_fail_chmod(fd, 0711); + mfd_fail_chmod(fd, 0600); + mfd_fail_write(fd); + close(fd); +} + +/* + * Test EXEC_NO_SEAL + * Test fd is created with exec and not allow sealing. + */ +static void test_exec_no_seal(void) +{ + int fd; + + printf("%s EXEC_NO_SEAL\n", memfd_str); + + /* Create with EXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_exec_no_sealing", + mfd_def_size, + MFD_CLOEXEC | MFD_EXEC); + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + mfd_assert_chmod(fd, 0666); close(fd); } +/* + * Test memfd_create with MFD_NOEXEC flag + */ +static void test_noexec_seal(void) +{ + int fd; + + printf("%s NOEXEC_SEAL\n", memfd_str); + + /* Create with NOEXEC and ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); + + /* Create with NOEXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); +} + +static void test_sysctl_child(void) +{ + int fd; + + printf("%s sysctl 0\n", memfd_str); + sysctl_assert_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_0", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, 0); + mfd_assert_chmod(fd, 0644); + close(fd); + + printf("%s sysctl 1\n", memfd_str); + sysctl_assert_write("1"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + sysctl_fail_write("0"); + close(fd); + + printf("%s sysctl 2\n", memfd_str); + sysctl_assert_write("2"); + mfd_fail_new("kern_memfd_sysctl_2", + MFD_CLOEXEC | MFD_ALLOW_SEALING); + sysctl_fail_write("0"); + sysctl_fail_write("1"); +} + +static int newpid_thread_fn(void *arg) +{ + test_sysctl_child(); + return 0; +} + +static void test_sysctl_child2(void) +{ + int fd; + + sysctl_fail_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); +} + +static int newpid_thread_fn2(void *arg) +{ + test_sysctl_child2(); + return 0; +} +static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)) +{ + uint8_t *stack; + pid_t pid; + + stack = malloc(STACK_SIZE); + if (!stack) { + printf("malloc(STACK_SIZE) failed: %m\n"); + abort(); + } + + pid = clone(fn, + stack + STACK_SIZE, + SIGCHLD | flags, + NULL); + if (pid < 0) { + printf("clone() failed: %m\n"); + abort(); + } + + return pid; +} + +static void join_newpid_thread(pid_t pid) +{ + waitpid(pid, NULL, 0); +} + +/* + * Test sysctl + * A very basic sealing test to see whether setting/retrieving seals works. + */ +static void test_sysctl(void) +{ + int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn); + + join_newpid_thread(pid); + + printf("%s child ns\n", memfd_str); + sysctl_assert_write("1"); + + pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2); + join_newpid_thread(pid); +} + /* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. @@ -1173,13 +1387,15 @@ int main(int argc, char **argv) test_create(); test_basic(); + test_exec_seal(); + test_exec_no_seal(); + test_noexec_seal(); test_seal_write(); test_seal_future_write(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); - test_seal_exec(); test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); @@ -1195,6 +1411,8 @@ int main(int argc, char **argv) test_share_fork("SHARE-FORK", SHARED_FT_STR); join_idle_thread(pid); + test_sysctl(); + printf("memfd: DONE\n"); return 0; -- cgit v1.2.3 From 379c2e60e82ff71510a949033bf8431f39f66c75 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 12 Dec 2022 15:50:42 -0800 Subject: hugetlb: update vma flag check for hugetlb vma lock The check for whether a hugetlb vma lock exists partially depends on the vma's flags. Currently, it checks for either VM_MAYSHARE or VM_SHARED. The reason both flags are used is because VM_MAYSHARE was previously cleared in hugetlb vmas as they are tore down. This is no longer the case, and only the VM_MAYSHARE check is required. Link: https://lkml.kernel.org/r/20221212235042.178355-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: James Houghton Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92b3fd01a652..ed1ac2df582c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -262,8 +262,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) */ static bool __vma_shareable_lock(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && - vma->vm_private_data; + return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; } void hugetlb_vma_lock_read(struct vm_area_struct *vma) -- cgit v1.2.3 From 243b1f2d3b09b6c813c2e4a179cdf5bfc878eb54 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:52 -0500 Subject: mm/hugetlb: let vma_offset_start() to return start Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd unshare", v4. Problem ======= huge_pte_offset() is a major helper used by hugetlb code paths to walk a hugetlb pgtable. It's used mostly everywhere since that's needed even before taking the pgtable lock. huge_pte_offset() is always called with mmap lock held with either read or write. It was assumed to be safe but it's actually not. One race condition can easily trigger by: (1) firstly trigger pmd share on a memory range, (2) do huge_pte_offset() on the range, then at the meantime, (3) another thread unshare the pmd range, and the pgtable page is prone to lost if the other shared process wants to free it completely (by either munmap or exit mm). The recent work from Mike on vma lock can resolve most of this already. It's achieved by forbidden pmd unsharing during the lock being taken, so no further risk of the pgtable page being freed. It means if we can take the vma lock around all huge_pte_offset() callers it'll be safe. There're already a bunch of them that we did as per the latest mm-unstable, but also quite a few others that we didn't for various reasons especially on huge_pte_offset() usage. One more thing to mention is that besides the vma lock, i_mmap_rwsem can also be used to protect the pgtable page (along with its pgtable lock) from being freed from under us. IOW, huge_pte_offset() callers need to either hold the vma lock or i_mmap_rwsem to safely walk the pgtables. A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is very hard to trigger, one needs to apply another kernel delay patch too, see below): ======8<======= #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #define MSIZE (1UL << 30) /* 1GB */ #define PSIZE (2UL << 20) /* 2MB */ #define HOLD_SEC (1) int pipefd[2]; void *buf; void *do_map(int fd) { unsigned char *tmpbuf, *p; int ret; ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE); if (ret) { perror("posix_memalign() failed"); return NULL; } tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); if (tmpbuf == MAP_FAILED) { perror("mmap() failed"); return NULL; } printf("mmap() -> %p\n", tmpbuf); for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) { *p = 1; } return tmpbuf; } void do_unmap(void *buf) { munmap(buf, MSIZE); } void proc2(int fd) { unsigned char c; buf = do_map(fd); if (!buf) return; read(pipefd[0], &c, 1); /* * This frees the shared pgtable page, causing use-after-free in * proc1_thread1 when soft walking hugetlb pgtable. */ do_unmap(buf); printf("Proc2 quitting\n"); } void *proc1_thread1(void *data) { /* * Trigger follow-page on 1st 2m page. Kernel hack patch needed to * withhold this procedure for easier reproduce. */ madvise(buf, PSIZE, MADV_POPULATE_WRITE); printf("Proc1-thread1 quitting\n"); return NULL; } void *proc1_thread2(void *data) { unsigned char c; /* Wait a while until proc1_thread1() start to wait */ sleep(0.5); /* Trigger pmd unshare */ madvise(buf, PSIZE, MADV_DONTNEED); /* Kick off proc2 to release the pgtable */ write(pipefd[1], &c, 1); printf("Proc1-thread2 quitting\n"); return NULL; } void proc1(int fd) { pthread_t tid1, tid2; int ret; buf = do_map(fd); if (!buf) return; ret = pthread_create(&tid1, NULL, proc1_thread1, NULL); assert(ret == 0); ret = pthread_create(&tid2, NULL, proc1_thread2, NULL); assert(ret == 0); /* Kick the child to share the PUD entry */ pthread_join(tid1, NULL); pthread_join(tid2, NULL); do_unmap(buf); } int main(void) { int fd, ret; fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB); if (fd < 0) { perror("open failed"); return -1; } ret = ftruncate(fd, MSIZE); if (ret) { perror("ftruncate() failed"); return -1; } ret = pipe(pipefd); if (ret) { perror("pipe() failed"); return -1; } if (fork()) { proc1(fd); } else { proc2(fd); } close(pipefd[0]); close(pipefd[1]); close(fd); return 0; } ======8<======= The kernel patch needed to present such a race so it'll trigger 100%: ======8<======= : diff --git a/mm/hugetlb.c b/mm/hugetlb.c : index 9d97c9a2a15d..f8d99dad5004 100644 : --- a/mm/hugetlb.c : +++ b/mm/hugetlb.c : @@ -38,6 +38,7 @@ : #include : #include : #include : +#include : : #include : #include : @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, : bool unshare = false; : int absent; : struct page *page; : + unsigned long c = 0; : : /* : * If we have a pending SIGKILL, don't keep faulting pages and : @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, : */ : pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), : huge_page_size(h)); : + : + pr_info("%s: withhold 1 sec...\n", __func__); : + for (c = 0; c < 100; c++) { : + udelay(10000); : + } : + pr_info("%s: withhold 1 sec...done\n", __func__); : + : if (pte) : ptl = huge_pte_lock(h, mm, pte); : absent = !pte || huge_pte_none(huge_ptep_get(pte)); : ======8<======= It'll trigger use-after-free of the pgtable spinlock: ======8<======= [ 16.959907] follow_hugetlb_page: withhold 1 sec... [ 17.960315] follow_hugetlb_page: withhold 1 sec...done [ 17.960550] ------------[ cut here ]------------ [ 17.960742] DEBUG_LOCKS_WARN_ON(1) [ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0 [ 17.961264] Modules linked in: [ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46 [ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0 [ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9 [ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096 [ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000 [ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff [ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58 [ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000 [ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001 [ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000 [ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0 [ 17.965956] PKRU: 55555554 [ 17.966068] Call Trace: [ 17.966172] [ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30 [ 17.966455] lock_acquire+0xbf/0x2b0 [ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.966799] ? _printk+0x48/0x4e [ 17.966934] _raw_spin_lock+0x2f/0x40 [ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4 [ 17.967473] __get_user_pages+0xbb/0x620 [ 17.967635] faultin_vma_page_range+0x9a/0x100 [ 17.967817] madvise_vma_behavior+0x3c0/0xbd0 [ 17.967998] ? mas_prev+0x11/0x290 [ 17.968141] ? find_vma_prev+0x5e/0xa0 [ 17.968304] ? madvise_vma_anon_name+0x70/0x70 [ 17.968486] madvise_walk_vmas+0xa9/0x120 [ 17.968650] do_madvise.part.0+0xfa/0x270 [ 17.968813] __x64_sys_madvise+0x5a/0x70 [ 17.968974] do_syscall_64+0x37/0x90 [ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 17.969329] RIP: 0033:0x7f1840f0efdb [ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68 [ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c [ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb [ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000 [ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f [ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80 [ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000 [ 17.972083] [ 17.972199] irq event stamp: 2353 [ 17.972372] hardirqs last enabled at (2353): [] __up_console_sem+0x5e/0x70 [ 17.972869] hardirqs last disabled at (2352): [] __up_console_sem+0x43/0x70 [ 17.973365] softirqs last enabled at (2330): [] __irq_exit_rcu+0xed/0x160 [ 17.973857] softirqs last disabled at (2323): [] __irq_exit_rcu+0xed/0x160 [ 17.974341] ---[ end trace 0000000000000000 ]--- [ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8 [ 17.975012] #PF: supervisor read access in kernel mode [ 17.975314] #PF: error_code(0x0000) - not-present page [ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0 [ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46 [ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0 [ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f [ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046 [ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000 [ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff [ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58 [ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000 [ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001 [ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000 [ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0 [ 17.986674] PKRU: 55555554 [ 17.986832] Call Trace: [ 17.987012] [ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30 [ 17.987770] lock_acquire+0xbf/0x2b0 [ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.988575] ? _printk+0x48/0x4e [ 17.988889] _raw_spin_lock+0x2f/0x40 [ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4 [ 17.990119] __get_user_pages+0xbb/0x620 [ 17.990500] faultin_vma_page_range+0x9a/0x100 [ 17.990928] madvise_vma_behavior+0x3c0/0xbd0 [ 17.991354] ? mas_prev+0x11/0x290 [ 17.991678] ? find_vma_prev+0x5e/0xa0 [ 17.992024] ? madvise_vma_anon_name+0x70/0x70 [ 17.992421] madvise_walk_vmas+0xa9/0x120 [ 17.992793] do_madvise.part.0+0xfa/0x270 [ 17.993166] __x64_sys_madvise+0x5a/0x70 [ 17.993539] do_syscall_64+0x37/0x90 [ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd ======8<======= Resolution ========== This patchset protects all the huge_pte_offset() callers to also take the vma lock properly. Patch Layout ============ Patch 1-2: cleanup, or dependency of the follow up patches Patch 3: before fixing, document huge_pte_offset() on lock required Patch 4-8: each patch resolves one possible race condition Patch 9: introduce hugetlb_walk() to replace huge_pte_offset() Tests ===== The series is verified with the above reproducer so the race cannot trigger anymore. It also passes all hugetlb kselftests. This patch (of 9): Even though vma_offset_start() is named like that, it's not returning "the start address of the range" but rather the offset we should use to offset the vma->vm_start address. Make it return the real value of the start vaddr, and it also helps for all the callers because whenever the retval is used, it'll be ultimately added into the vma->vm_start anyway, so it's better. Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..fdb16246f46e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,12 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { + unsigned long offset = 0; + if (vma->vm_pgoff < start) - return (start - vma->vm_pgoff) << PAGE_SHIFT; - else - return 0; + offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + + return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) @@ -457,7 +459,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -473,8 +475,8 @@ retry: break; } - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, ZAP_FLAG_DROP_MARKER); + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } @@ -507,10 +509,9 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) - unmap_hugepage_range(vma, vma->vm_start + v_start, - v_end, NULL, - ZAP_FLAG_DROP_MARKER); + if (hugetlb_vma_maps_page(vma, v_start, page)) + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); @@ -540,8 +541,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, zap_flags); + unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private -- cgit v1.2.3 From bb373dce2c7b473023f9e69f041a22d81171b71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:53 -0500 Subject: mm/hugetlb: don't wait for migration entry during follow page That's what the code does with !hugetlb pages, so we should logically do the same for hugetlb, so migration entry will also be treated as no page. This is probably also the last piece in follow_page code that may sleep, the last one should be removed in cf994dd8af27 ("mm/gup: remove FOLL_MIGRATION", 2022-11-16). Link: https://lkml.kernel.org/r/20221216155100.2043537-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1ac2df582c..549f79668756 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6401,7 +6401,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; -retry: pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) return NULL; @@ -6424,16 +6423,6 @@ retry: page = NULL; goto out; } - } else { - if (is_hugetlb_entry_migration(entry)) { - spin_unlock(ptl); - __migration_entry_wait_huge(pte, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ } out: spin_unlock(ptl); -- cgit v1.2.3 From fe7d4c6d5a42f5bdc63fdfdca2cad32c8a779e23 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:54 -0500 Subject: mm/hugetlb: document huge_pte_offset usage huge_pte_offset() is potentially a pgtable walker, looking up pte_t* for a hugetlb address. Normally, it's always safe to walk a generic pgtable as long as we're with the mmap lock held for either read or write, because that guarantees the pgtable pages will always be valid during the process. But it's not true for hugetlbfs, especially shared: hugetlbfs can have its pgtable freed by pmd unsharing, it means that even with mmap lock held for current mm, the PMD pgtable page can still go away from under us if pmd unsharing is possible during the walk. So we have two ways to make it safe even for a shared mapping: (1) If we're with the hugetlb vma lock held for either read/write, it's okay because pmd unshare cannot happen at all. (2) If we're with the i_mmap_rwsem lock held for either read/write, it's okay because even if pmd unshare can happen, the pgtable page cannot be freed from under us. Document it. Link: https://lkml.kernel.org/r/20221216155100.2043537-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..d755e2a7c0db 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -192,6 +192,38 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); +/* + * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. + * Returns the pte_t* if found, or NULL if the address is not mapped. + * + * Since this function will walk all the pgtable pages (including not only + * high-level pgtable page, but also PUD entry that can be unshared + * concurrently for VM_SHARED), the caller of this function should be + * responsible of its thread safety. One can follow this rule: + * + * (1) For private mappings: pmd unsharing is not possible, so holding the + * mmap_lock for either read or write is sufficient. Most callers + * already hold the mmap_lock, so normally, no special action is + * required. + * + * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged + * pgtable page can go away from under us! It can be done by a pmd + * unshare with a follow up munmap() on the other process), then we + * need either: + * + * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare + * won't happen upon the range (it also makes sure the pte_t we + * read is the right and stable one), or, + * + * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make + * sure even if unshare happened the racy unmap() will wait until + * i_mmap_rwsem is released. + * + * Option (2.1) is the safest, which guarantees pte stability from pmd + * sharing pov, until the vma lock released. Option (2.2) doesn't protect + * a concurrent pmd unshare, but it makes sure the pgtable page is safe to + * access. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -- cgit v1.2.3 From fcd48540d188876c917a377d81cd24c100332a62 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:55 -0500 Subject: mm/hugetlb: move swap entry handling into vma lock when faulted In hugetlb_fault(), there used to have a special path to handle swap entry at the entrance using huge_pte_offset(). That's unsafe because huge_pte_offset() for a pmd sharable range can access freed pgtables if without any lock to protect the pgtable from being freed after pmd unshare. Here the simplest solution to make it safe is to move the swap handling to be after the vma lock being held. We may need to take the fault mutex on either migration or hwpoison entries now (also the vma lock, but that's really needed), however neither of them is hot path. Note that the vma lock cannot be released in hugetlb_fault() when the migration entry is detected, because in migration_entry_wait_huge() the pgtable page will be used again (by taking the pgtable lock), so that also need to be protected by the vma lock. Modify migration_entry_wait_huge() so that it must be called with vma read lock held, and properly release the lock in __migration_entry_wait_huge(). Link: https://lkml.kernel.org/r/20221216155100.2043537-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ++++-- mm/hugetlb.c | 37 ++++++++++++++++--------------------- mm/migrate.c | 25 +++++++++++++++++++++---- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b982dd614572..3a451b7afcb3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); #ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); #endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } #ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 549f79668756..7f9db1d9f6a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5993,22 +5993,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); - if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ - entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, ptep); - return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - } - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate @@ -6023,10 +6007,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire vma lock before calling huge_pte_alloc and hold * until finished with ptep. This prevents huge_pmd_unshare from * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. */ hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); @@ -6055,8 +6035,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will * properly handle it. */ - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) { + /* + * Release the hugetlb fault lock now, but retain + * the vma lock, because it is needed to guard the + * huge_pte_lockptr() later in + * migration_entry_wait_huge(). The vma lock will + * be released there. + */ + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + migration_entry_wait_huge(vma, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; + } /* * If we are going to COW/unshare the mapping later, we examine the diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..98de7ce2b576 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, } #ifdef CONFIG_HUGETLB_PAGE -void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) +/* + * The vma read lock must be held upon entry. Holding that lock prevents either + * the pte or the ptl from being freed. + * + * This function will release the vma lock before returning. + */ +void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { pte_t pte; + hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) + if (unlikely(!is_hugetlb_entry_migration(pte))) { spin_unlock(ptl); - else + hugetlb_vma_unlock_read(vma); + } else { + /* + * If migration entry existed, safe to release vma lock + * here because the pgtable page won't be freed without the + * pgtable lock released. See comment right above pgtable + * lock release in migration_entry_wait_on_locked(). + */ + hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + } } void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - __migration_entry_wait_huge(pte, ptl); + __migration_entry_wait_huge(vma, pte, ptl); } #endif -- cgit v1.2.3 From b8da2e466000b232a6b67072bbef375061142daa Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:17 -0500 Subject: mm/hugetlb: make userfaultfd_huge_must_wait() safe to pmd unshare We can take the hugetlb walker lock, here taking vma lock directly. Link: https://lkml.kernel.org/r/20221216155217.2043700-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cc694846617a..3b1797e0448a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -391,7 +391,8 @@ static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = vmf->vma->vm_mm; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; @@ -418,7 +419,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) */ mmap_assert_locked(mm); - ctx = vmf->vma->vm_userfaultfd_ctx.ctx; + ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -508,6 +509,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) blocking_state = userfaultfd_get_blocking_state(vmf->flags); + /* + * Take the vma lock now, in order to safely call + * userfaultfd_huge_must_wait() later. Since acquiring the + * (sleepable) vma lock can modify the current task state, that + * must be before explicitly calling set_current_state(). + */ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland @@ -522,13 +532,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vmf->vma)) + if (!is_vm_hugetlb_page(vma)) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + must_wait = userfaultfd_huge_must_wait(ctx, vma, vmf->address, vmf->flags, reason); + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); mmap_read_unlock(mm); if (likely(must_wait && !READ_ONCE(ctx->released))) { -- cgit v1.2.3 From 7d049f3a03ea705522210d70b9d3e223ef86d663 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:19 -0500 Subject: mm/hugetlb: make hugetlb_follow_page_mask() safe to pmd unshare Since hugetlb_follow_page_mask() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155219.2043714-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7f9db1d9f6a5..807edc1410e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6396,9 +6396,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; + hugetlb_vma_lock_read(vma); pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) - return NULL; + goto out_unlock; ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); @@ -6421,6 +6422,8 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } out: spin_unlock(ptl); +out_unlock: + hugetlb_vma_unlock_read(vma); return page; } -- cgit v1.2.3 From eefc7fa53608920203a1402ecf7255ecfa8bb030 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:23 -0500 Subject: mm/hugetlb: make follow_hugetlb_page() safe to pmd unshare Since follow_hugetlb_page() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155223.2043727-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 807edc1410e5..da4c37553c08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6454,6 +6454,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } + hugetlb_vma_lock_read(vma); /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the @@ -6478,6 +6479,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !hugetlbfs_pagecache_present(h, vma, vaddr)) { if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; break; } @@ -6499,6 +6501,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; else if (unshare) @@ -6561,6 +6565,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, remainder -= pages_per_huge_page(h); i += pages_per_huge_page(h); spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); continue; } @@ -6590,6 +6595,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; err = -ENOMEM; break; @@ -6601,6 +6607,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, i += refs; spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } *nr_pages = remainder; /* -- cgit v1.2.3 From dd361e5033cf36c51acab996ea17748b81cedb38 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:26 -0500 Subject: mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 11 ++++++++++- mm/hmm.c | 15 ++++++++++++++- mm/pagewalk.c | 2 ++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 959f52e5867d..27a6df448ee5 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -21,7 +21,16 @@ struct mm_walk; * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) * are skipped. - * @hugetlb_entry: if set, called for each hugetlb entry + * @hugetlb_entry: if set, called for each hugetlb entry. This hook + * function is called with the vma lock held, in order to + * protect against a concurrent freeing of the pte_t* or + * the ptl. In some cases, the hook function needs to drop + * and retake the vma lock in order to avoid deadlocks + * while calling other functions. In such cases the hook + * function must either refrain from accessing the pte or + * ptl after dropping the vma lock, or else revalidate + * those items after re-acquiring the vma lock and before + * accessing them. * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means * "do page table walk over the current vma", returning diff --git a/mm/hmm.c b/mm/hmm.c index 601a99ce3c84..6a151c09de5e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) { + int ret; + spin_unlock(ptl); - return hmm_vma_fault(addr, end, required_fault, walk); + hugetlb_vma_unlock_read(vma); + /* + * Avoid deadlock: drop the vma lock before calling + * hmm_vma_fault(), which will itself potentially take and + * drop the vma lock. This is also correct from a + * protection point of view, because there is no further + * use here of either pte or ptl after dropping the vma + * lock. + */ + ret = hmm_vma_fault(addr, end, required_fault, walk); + hugetlb_vma_lock_read(vma); + return ret; } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7f1c9b274906..d98564a7be57 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, const struct mm_walk_ops *ops = walk->ops; int err = 0; + hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = huge_pte_offset(walk->mm, addr & hmask, sz); @@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, if (err) break; } while (addr = next, addr != end); + hugetlb_vma_unlock_read(vma); return err; } -- cgit v1.2.3 From 9c67a20704e763f9cb8cd262c3e45de7bd2816bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:29 -0500 Subject: mm/hugetlb: introduce hugetlb_walk() huge_pte_offset() is the main walker function for hugetlb pgtables. The name is not really representing what it does, though. Instead of renaming it, introduce a wrapper function called hugetlb_walk() which will use huge_pte_offset() inside. Assert on the locks when walking the pgtable. Note, the vma lock assertion will be a no-op for private mappings. Document the last special case in the page_vma_mapped_walk() path where we don't need any more lock to call hugetlb_walk(). Taking vma lock there is not needed because either: (1) potential callers of hugetlb pvmw holds i_mmap_rwsem already (from one rmap_walk()), or (2) the caller will not walk a hugetlb vma at all so the hugetlb code path not reachable (e.g. in ksm or uprobe paths). It's slightly implicit for future page_vma_mapped_walk() callers on that lock requirement. But anyway, when one day this rule breaks, one will get a straightforward warning in hugetlb_walk() with lockdep, then there'll be a way out. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20221216155229.2043750-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 +--- fs/userfaultfd.c | 6 ++---- include/linux/hugetlb.h | 37 +++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 31 +++++++++++++------------------ mm/page_vma_mapped.c | 9 ++++++--- mm/pagewalk.c | 4 +--- 6 files changed, 60 insertions(+), 31 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fdb16246f46e..48f1a8ad2243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, { pte_t *ptep, pte; - ptep = huge_pte_offset(vma->vm_mm, addr, - huge_page_size(hstate_vma(vma))); - + ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3b1797e0448a..15a5bf765d43 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long flags, unsigned long reason) { - struct mm_struct *mm = ctx->mm; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(mm); - - ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + mmap_assert_locked(ctx->mm); + ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); if (!ptep) goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d755e2a7c0db..b6b10101bea7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include #include #include #include @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * + * IMPORTANT: we should normally not directly call this function, instead + * this is only a common interface to implement arch-specific + * walker. Please use hugetlb_walk() instead, because that will attempt to + * verify the locking for you. + * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be @@ -1229,4 +1235,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif +static inline bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; +} + +/* + * Safe version of huge_pte_offset() to check the locks. See comments + * above huge_pte_offset(). + */ +static inline pte_t * +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) +{ +#if defined(CONFIG_HUGETLB_PAGE) && \ + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * If pmd sharing possible, locking needed to safely walk the + * hugetlb pgtables. More information can be found at the comment + * above huge_pte_offset() in the same file. + * + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. + */ + if (__vma_shareable_lock(vma)) + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && + !lockdep_is_held( + &vma->vm_file->f_mapping->i_mmap_rwsem)); +#endif + return huge_pte_offset(vma->vm_mm, addr, sz); +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da4c37553c08..0e5441d6890a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -260,11 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) /* * hugetlb vma_lock helper routines */ -static bool __vma_shareable_lock(struct vm_area_struct *vma) -{ - return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; -} - void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { @@ -4980,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else { /* * For shared mappings the vma lock must be held before - * calling huge_pte_offset in the src vma. Otherwise, the + * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ @@ -4990,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); + src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; @@ -5197,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { - src_pte = huge_pte_offset(mm, old_addr, sz); + src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; @@ -5260,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; @@ -5573,7 +5568,7 @@ retry_avoidcopy: mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -5611,7 +5606,7 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -6397,7 +6392,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, return NULL; hugetlb_vma_lock_read(vma); - pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + pte = hugetlb_walk(vma, haddr, huge_page_size(h)); if (!pte) goto out_unlock; @@ -6462,8 +6457,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), - huge_page_size(h)); + pte = hugetlb_walk(vma, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -6654,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address, psize); + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; @@ -7064,8 +7059,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr, - vma_mmu_pagesize(svma)); + spte = hugetlb_walk(svma, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -7377,7 +7372,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 93e13fc17d3c..4e448cfbc6ef 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); - - /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, size); + /* + * All callers that get here will already hold the + * i_mmap_rwsem. Therefore, no additional locks need to be + * taken before calling hugetlb_walk(). + */ + pvmw->pte = hugetlb_walk(vma, pvmw->address, size); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d98564a7be57..cb23f8a15c13 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -305,13 +305,11 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask, sz); - + pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); - if (err) break; } while (addr = next, addr != end); -- cgit v1.2.3 From d685c668b0695dff927c85e27ef27d4f404f16a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:51 +0000 Subject: buffer: add b_folio as an alias of b_page Patch series "Start converting buffer_heads to use folios". I was hoping that filesystems would convert from buffer_heads to iomap, but that's not happening particularly quickly. So the buffer_head infrastructure needs to be converted from being page-based to being folio-based. This patch (of 12): Buffer heads point to the allocation (ie the folio), not the page. This is currently the same thing for all filesystems that use buffer heads, so this is a safe transitional step. Link: https://lkml.kernel.org/r/20221215214402.3522366-1-willy@infradead.org Link: https://lkml.kernel.org/r/20221215214402.3522366-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 33fa5e94aa80..8f14dca5fed7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ - struct page *b_page; /* the page this bh is mapped to */ + union { + struct page *b_page; /* the page this bh is mapped to */ + struct folio *b_folio; /* the folio this bh is mapped to */ + }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ -- cgit v1.2.3 From abc8a8a2c7dc7b557619befa8fb29be60ed481bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:52 +0000 Subject: buffer: replace obvious uses of b_page with b_folio These cases just check if it's NULL, or use b_page to get to the page's address space. They are assumptions that b_page never points to a tail page. Link: https://lkml.kernel.org/r/20221215214402.3522366-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dd..e1055fe0b366 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -321,7 +321,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { /* Decrypt if needed */ if (uptodate && - fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { + fscrypt_inode_uses_fs_layer_crypto(bh->b_folio->mapping->host)) { struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { @@ -570,7 +570,7 @@ void write_boundary_block(struct block_device *bdev, void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); if (!mapping->private_data) { @@ -1073,7 +1073,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1117,8 +1117,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ - if (bh->b_page && bh->b_page->mapping) - mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_folio && bh->b_folio->mapping) + mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); rcu_read_lock(); @@ -1154,7 +1154,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (bh->b_assoc_map) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); -- cgit v1.2.3 From 03c5f331234c5798965fa654783dbed1c792c7f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:53 +0000 Subject: buffer: use b_folio in touch_buffer() Removes a call to compound_head() in this path. Link: https://lkml.kernel.org/r/20221215214402.3522366-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index e1055fe0b366..8a02fdaeec9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -60,7 +60,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, inline void touch_buffer(struct buffer_head *bh) { trace_block_touch_buffer(bh); - mark_page_accessed(bh->b_page); + folio_mark_accessed(bh->b_folio); } EXPORT_SYMBOL(touch_buffer); -- cgit v1.2.3 From 2e2dba15d107491e972041acb2d0b7bd73b92bc0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:54 +0000 Subject: buffer: use b_folio in end_buffer_async_read() Removes a call to compound_head() in SetPageError(), saving 76 bytes of text. Link: https://lkml.kernel.org/r/20221215214402.3522366-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8a02fdaeec9a..5bdcc040eca3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -246,18 +246,18 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; - int page_uptodate = 1; + struct folio *folio; + int folio_uptodate = 1; BUG_ON(!buffer_async_read(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); - SetPageError(page); + folio_set_error(folio); } /* @@ -265,14 +265,14 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) - page_uptodate = 0; + folio_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; @@ -285,9 +285,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * If all of the buffers are uptodate then we can set the page * uptodate. */ - if (page_uptodate) - SetPageUptodate(page); - unlock_page(page); + if (folio_uptodate) + folio_mark_uptodate(folio); + folio_unlock(folio); return; still_busy: -- cgit v1.2.3 From 743ed81ec11147b42085a73a2e408964674291a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:55 +0000 Subject: buffer: use b_folio in end_buffer_async_write() Save 76 bytes from avoiding the call to compound_head() in SetPageError(). Also avoid the call to compound_head() in end_page_writeback(). Link: https://lkml.kernel.org/r/20221215214402.3522366-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5bdcc040eca3..c44ca40530c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -344,21 +344,21 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; + struct folio *folio; BUG_ON(!buffer_async_write(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); - SetPageError(page); + folio_set_error(folio); } - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); @@ -372,7 +372,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) tmp = tmp->b_this_page; } spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - end_page_writeback(page); + folio_end_writeback(folio); return; still_busy: -- cgit v1.2.3 From c10d91194d5d630a0befc7bc116aba3cfda8a226 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:56 +0000 Subject: page_io: remove buffer_head include page_io never uses buffer_heads to do I/O. Link: https://lkml.kernel.org/r/20221215214402.3522366-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- mm/page_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index 3a5f921b932e..905d9fcc0c96 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From cf1d3417e634b2b2dd162a7e193af9a9d700431b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:57 +0000 Subject: buffer: use b_folio in mark_buffer_dirty() Removes about four calls to compound_head(). Two of them are inline which removes 132 bytes from the kernel text. Link: https://lkml.kernel.org/r/20221215214402.3522366-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c44ca40530c3..7e42d67bcaad 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1095,16 +1095,16 @@ void mark_buffer_dirty(struct buffer_head *bh) } if (!test_set_buffer_dirty(bh)) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - mapping = page_mapping(page); + folio_memcg_lock(folio); + if (!folio_test_set_dirty(folio)) { + mapping = folio->mapping; if (mapping) - __set_page_dirty(page, mapping, 0); + __folio_mark_dirty(folio, mapping, 0); } - unlock_page_memcg(page); + folio_memcg_unlock(folio); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } -- cgit v1.2.3 From 11551cf15ecc17c4db4456538fd73d284ffcf20b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:58 +0000 Subject: gfs2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space. Link: https://lkml.kernel.org/r/20221215214402.3522366-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/gfs2/glops.c | 2 +- fs/gfs2/log.c | 2 +- fs/gfs2/meta_io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d78b61ecc1cd..081422644ec5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -39,7 +39,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_page->mapping, bh->b_page->flags); + bh->b_folio->mapping, bh->b_folio->flags); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 723639376ae2..1fcc829f02ab 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -127,7 +127,7 @@ __acquires(&sdp->sd_ail_lock) continue; gl = bd->bd_gl; list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); - mapping = bh->b_page->mapping; + mapping = bh->b_folio->mapping; if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 3c41b864ee5b..924361fa510b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -334,7 +334,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { - struct address_space *mapping = bh->b_page->mapping; + struct address_space *mapping = bh->b_folio->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; struct gfs2_trans *tr = current->journal_info; -- cgit v1.2.3 From 0d22fe2f039e971c2d7cc97d19ce48d8bdae253c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:59 +0000 Subject: jbd2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or have already been converted to folio. Link: https://lkml.kernel.org/r/20221215214402.3522366-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 8 ++------ fs/jbd2/journal.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4810438b7856..96a1ebc6342d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -63,16 +63,12 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) static void release_buffer_page(struct buffer_head *bh) { struct folio *folio; - struct page *page; if (buffer_dirty(bh)) goto nope; if (atomic_read(&bh->b_count) != 1) goto nope; - page = bh->b_page; - if (!page) - goto nope; - folio = page_folio(page); + folio = bh->b_folio; if (folio->mapping) goto nope; @@ -1040,7 +1036,7 @@ restart_loop: * already detached from the mapping and buffers cannot * get reused. */ - mapping = READ_ONCE(bh->b_page->mapping); + mapping = READ_ONCE(bh->b_folio->mapping); if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { clear_buffer_mapped(bh); clear_buffer_new(bh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2696f43e7239..4095fe91457f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2938,7 +2938,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_folio && bh->b_folio->mapping)); if (!new_jh) { jbd_unlock_bh_journal_head(bh); -- cgit v1.2.3 From 6ad4cd7f36000ae145373b68ac61f15a1793b054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:00 +0000 Subject: nilfs2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or the index of the page the buffer is in. Link: https://lkml.kernel.org/r/20221215214402.3522366-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/segment.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e74fda212620..e956f886a1a1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -188,7 +188,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct page *opage = obh->b_page; lock_page(opage); retry: - /* BUG_ON(oldkey != obh->b_page->index); */ + /* BUG_ON(oldkey != obh->b_folio->index); */ if (unlikely(oldkey != opage->index)) NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 40ce92a332fe..b5f997e5e670 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -398,7 +398,7 @@ int nilfs_btree_broken_node_block(struct buffer_head *bh) if (buffer_nilfs_checked(bh)) return 0; - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index b0d22ff24b67..48fe71d309cb 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -140,7 +140,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) { wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = bh->b_folio->mapping->host; nilfs_err(inode->i_sb, "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index cbf4fa60eea2..19c8158605ed 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -563,7 +563,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); if (!page) return -ENOMEM; @@ -595,7 +595,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); + page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 76c3bd88b858..f7a14ed12a66 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1581,7 +1581,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; -- cgit v1.2.3 From ac55e78d9e44a5374f5726b94e55f5a7b5d51fb9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:01 +0000 Subject: reiserfs: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or call page_folio() on b_page to get a folio. Link: https://lkml.kernel.org/r/20221215214402.3522366-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/journal.c | 4 ++-- fs/reiserfs/tail_conversion.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9f62da7471c9..9ce4ec296b74 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -601,7 +601,7 @@ static int journal_list_still_alive(struct super_block *s, */ static void release_buffer_page(struct buffer_head *bh) { - struct folio *folio = page_folio(bh->b_page); + struct folio *folio = bh->b_folio; if (!folio->mapping && folio_trylock(folio)) { folio_get(folio); put_bh(bh); @@ -866,7 +866,7 @@ loop_next: * will ever write the buffer. We're safe if we write the * page one last time after freeing the journal header. */ - if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) { spin_unlock(lock); write_dirty_buffer(bh, 0); spin_lock(lock); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index b0ae088dffc7..2cec61af2a9e 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -177,7 +177,7 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) * BUG() on attempt to write not mapped buffer */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = bh->b_folio->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); -- cgit v1.2.3 From a5fd8390d2b2db15fd043b8bd571b536101222c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:02 +0000 Subject: mpage: use b_folio in do_mpage_readpage() Remove this conversion of a folio back to a page. Link: https://lkml.kernel.org/r/20221215214402.3522366-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/mpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/mpage.c b/fs/mpage.c index 0f8ae954a579..db59cbf6affc 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -198,7 +198,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* * Then do more get_blocks calls until we are done with this folio. */ - map_bh->b_page = &folio->page; + map_bh->b_folio = folio; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; -- cgit v1.2.3 From e976936cfc66376fc740a3a476365273384ce1ce Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Dec 2022 14:45:37 -0500 Subject: mm/mempolicy: do not duplicate policy if it is not applicable for set_mempolicy_home_node set_mempolicy_home_node tries to duplicate a memory policy before checking it whether it is applicable for the operation. There is no real reason for doing that and it might actually be a pointless memory allocation and deallocation exercise for MPOL_INTERLEAVE. Not a big problem but we can do better. Simply check the policy before acting on it. Link: https://lkml.kernel.org/r/20221216194537.238047-2-mathieu.desnoyers@efficios.com Signed-off-by: Michal Hocko Reviewed-by: Mathieu Desnoyers Signed-off-by: Mathieu Desnoyers Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..becf41e10076 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1489,7 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct mempolicy *new; + struct mempolicy *new, *old; unsigned long vmstart; unsigned long vmend; unsigned long end; @@ -1521,31 +1521,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le return 0; mmap_write_lock(mm); for_each_vma_range(vmi, vma, end) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - new = mpol_dup(vma_policy(vma)); - if (IS_ERR(new)) { - err = PTR_ERR(new); - break; - } - /* - * Only update home node if there is an existing vma policy - */ - if (!new) - continue; - /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ - if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { - mpol_put(new); + old = vma_policy(vma); + if (!old) + continue; + if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } + new = mpol_dup(old); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } new->home_node = home_node; + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); err = mbind_range(mm, vmstart, vmend, new); mpol_put(new); if (err) -- cgit v1.2.3 From 6b1ead5985bf73b7dd4453f5cd3b8690a9c52cd5 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Mon, 12 Dec 2022 13:56:57 +0800 Subject: lib/test_vmalloc.c: add parameter use_huge for fix_size_alloc_test Add a parameter `use_huge' for fix_size_alloc_test(), which can be used to test allocation vie vmalloc_huge for both functionality and performance. Link: https://lkml.kernel.org/r/20221212055657.698420-1-panqinglin2020@iscas.ac.cn Signed-off-by: Qinglin Pan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index f90d2c27675b..de4ee0d50906 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -38,6 +38,9 @@ __param(int, test_loop_count, 1000000, __param(int, nr_pages, 0, "Set number of pages for fix_size_alloc_test(default: 1)"); +__param(bool, use_huge, false, + "Use vmalloc_huge in fix_size_alloc_test"); + __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" "\t\tid: 1, name: fix_size_alloc_test\n" @@ -264,7 +267,10 @@ static int fix_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE); + if (use_huge) + ptr = vmalloc_huge((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE, GFP_KERNEL); + else + ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE); if (!ptr) return -1; -- cgit v1.2.3 From cb6c33d4dc09a8fddda1867708956c27615775f4 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 8 Dec 2022 22:21:30 +0800 Subject: cma: tracing: print alloc result in trace_cma_alloc_finish The result of the allocation attempt is not printed in trace_cma_alloc_finish, but it's important to do it so we can set filters to catch specific errors on allocation or to trigger some operations on specific errors. We have printed the result in log, but the log is conditional and could not be filtered by tracing events. It introduces little overhead to print this result. The result of allocation is named `errorno' in the trace. Link: https://lkml.kernel.org/r/20221208142130.1501195-1-haowenchao@huawei.com Signed-off-by: Wenchao Hao Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/cma.h | 32 +++++++++++++++++++++++++++++--- mm/cma.c | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 3d708dae1542..ef75ea606ab2 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -91,12 +91,38 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, +TRACE_EVENT(cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned long count, unsigned int align), + unsigned long count, unsigned int align, int errorno), - TP_ARGS(name, pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align, errorno), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long, pfn) + __field(const struct page *, page) + __field(unsigned long, count) + __field(unsigned int, align) + __field(int, errorno) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->pfn = pfn; + __entry->page = page; + __entry->count = count; + __entry->align = align; + __entry->errorno = errorno; + ), + + TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d", + __get_str(name), + __entry->pfn, + __entry->page, + __entry->count, + __entry->align, + __entry->errorno) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, diff --git a/mm/cma.c b/mm/cma.c index 4a978e09547a..a75b17b03b66 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); /* * CMA can allocate multiple page blocks, which results in different -- cgit v1.2.3 From fc986a38b670d9b8f6af0596e973976018314a59 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Dec 2022 10:34:30 +0800 Subject: mm: huge_memory: convert madvise_free_huge_pmd to use a folio Using folios instead of pages removes several calls to compound_head(), Link: https://lkml.kernel.org/r/20221207023431.151008-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 867f02e6061d..3de266e0aeb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1603,7 +1603,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, { spinlock_t *ptl; pmd_t orig_pmd; - struct page *page; + struct folio *folio; struct mm_struct *mm = tlb->mm; bool ret = false; @@ -1623,15 +1623,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto out; } - page = pmd_page(orig_pmd); + folio = pfn_folio(pmd_pfn(orig_pmd)); /* - * If other processes are mapping this page, we couldn't discard - * the page unless they all do MADV_FREE so let's skip the page. + * If other processes are mapping this folio, we couldn't discard + * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (total_mapcount(page) != 1) + if (folio_mapcount(folio) != 1) goto out; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; /* @@ -1639,17 +1639,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * will deactivate only them. */ if (next - addr != HPAGE_PMD_SIZE) { - get_page(page); + folio_get(folio); spin_unlock(ptl); - split_huge_page(page); - unlock_page(page); - put_page(page); + split_folio(folio); + folio_unlock(folio); + folio_put(folio); goto out_unlocked; } - if (PageDirty(page)) - ClearPageDirty(page); - unlock_page(page); + if (folio_test_dirty(folio)) + folio_clear_dirty(folio); + folio_unlock(folio); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); @@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); ret = true; out: spin_unlock(ptl); -- cgit v1.2.3 From 6a6fe9ebd571a4092b7d5c1f11e4e1e15d296fa5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Dec 2022 10:06:18 +0800 Subject: mm: swap: convert mark_page_lazyfree() to folio_mark_lazyfree() mark_page_lazyfree() and the callers are converted to use folio, this rename and make it to take in a folio argument instead of calling page_folio(). Link: https://lkml.kernel.org/r/20221209020618.190306-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- mm/huge_memory.c | 2 +- mm/madvise.c | 2 +- mm/swap.c | 12 +++++------- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..93f1cebd8545 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -402,7 +402,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); extern void deactivate_page(struct page *page); -extern void mark_page_lazyfree(struct page *page); +void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); extern void lru_cache_add_inactive_or_unevictable(struct page *page, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3de266e0aeb2..266c4b557946 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - mark_page_lazyfree(&folio->page); + folio_mark_lazyfree(folio); ret = true; out: spin_unlock(ptl); diff --git a/mm/madvise.c b/mm/madvise.c index b6ea204d4e23..479d9a32e44a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -728,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(&folio->page); + folio_mark_lazyfree(folio); } out: if (nr_swap) { diff --git a/mm/swap.c b/mm/swap.c index 70e2063ef43a..5e5eba186930 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -757,16 +757,14 @@ void deactivate_page(struct page *page) } /** - * mark_page_lazyfree - make an anon page lazyfree - * @page: page to deactivate + * folio_mark_lazyfree - make an anon folio lazyfree + * @folio: folio to deactivate * - * mark_page_lazyfree() moves @page to the inactive file list. - * This is done to accelerate the reclaim of @page. + * folio_mark_lazyfree() moves @folio to the inactive file list. + * This is done to accelerate the reclaim of @folio. */ -void mark_page_lazyfree(struct page *page) +void folio_mark_lazyfree(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_lru(folio) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { -- cgit v1.2.3 From c5094ec79cbe487983e3a96548a7eb1c1c82c727 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Dec 2022 14:45:07 -0800 Subject: hugetlb: initialize variable to avoid compiler warning With the gcc 'maybe-uninitialized' warning enabled, gcc will produce: mm/hugetlb.c:6896:20: warning: `chg' may be used uninitialized This is a false positive, but may be difficult for the compiler to determine. maybe-uninitialized is disabled by default, but this gets flagged as a 0-DAY build regression. Initialize the variable to silence the warning. Link: https://lkml.kernel.org/r/20221216224507.106789-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0e5441d6890a..a82f41024167 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6760,7 +6760,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg, add = -1; + long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; -- cgit v1.2.3 From 4e0cf05f60590c4119c2eeacf136d1b832978370 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Dec 2022 18:13:39 +0100 Subject: mm: memcontrol: skip moving non-present pages that are mapped elsewhere Patch series "mm: push down lock_page_memcg()", v2. This patch (of 3): During charge moving, the pte lock and the page lock cover nearly all cases of stabilizing page_mapped(). The only exception is when we're looking at a non-present pte and find a page in the page cache or in the swapcache: if the page is mapped elsewhere, it can become unmapped outside of our control. For this reason, rmap needs lock_page_memcg(). We don't like cgroup-specific locks in generic MM code - especially in performance-critical MM code - and for a legacy feature that's unlikely to have many users left - if any. So remove the exception. Arguably that's better semantics anyway: the page is shared, and another process seems to be the more active user. Once we stop moving such pages, rmap doesn't need lock_page_memcg() anymore. The next patch will remove it. Link: https://lkml.kernel.org/r/20221206171340.139790-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20221206171340.139790-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..a698a2b6523b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5692,7 +5692,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must make sure the page is not on LRU (isolate_page() is useful.) + * The page must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. @@ -5709,20 +5709,13 @@ static int mem_cgroup_move_account(struct page *page, int nid, ret; VM_BUG_ON(from == to); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON(compound && !folio_test_large(folio)); - /* - * Prevent mem_cgroup_migrate() from looking at - * page's memory cgroup of its source page while we change it. - */ - ret = -EBUSY; - if (!folio_trylock(folio)) - goto out; - ret = -EINVAL; if (folio_memcg(folio) != from) - goto out_unlock; + goto out; pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); @@ -5809,8 +5802,6 @@ static int mem_cgroup_move_account(struct page *page, mem_cgroup_charge_statistics(from, -nr_pages); memcg_check_events(from, nid); local_irq_enable(); -out_unlock: - folio_unlock(folio); out: return ret; } @@ -5859,6 +5850,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (target && page) { + if (!trylock_page(page)) { + put_page(page); + return ret; + } + /* + * page_mapped() must be stable during the move. This + * pte is locked, so if it's present, the page cannot + * become unmapped. If it isn't, we have only partial + * control over the mapped state: the page lock will + * prevent new faults against pagecache and swapcache, + * so an unmapped page cannot become mapped. However, + * if the page is already mapped elsewhere, it can + * unmap, and there is nothing we can do about it. + * Alas, skip moving the page in this case. + */ + if (!pte_present(ptent) && page_mapped(page)) { + unlock_page(page); + put_page(page); + return ret; + } + } + if (!page && !ent.val) return ret; if (page) { @@ -5875,8 +5889,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (target) target->page = page; } - if (!ret || !target) + if (!ret || !target) { + if (target) + unlock_page(page); put_page(page); + } } /* * There is a swap entry and a page doesn't exist or isn't charged. @@ -5916,6 +5933,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, ret = MC_TARGET_PAGE; if (target) { get_page(page); + if (!trylock_page(page)) { + put_page(page); + return MC_TARGET_NONE; + } target->page = page; } } @@ -6154,6 +6175,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } putback_lru_page(page); } + unlock_page(page); put_page(page); } else if (target_type == MC_TARGET_DEVICE) { page = target.page; @@ -6162,6 +6184,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } + unlock_page(page); put_page(page); } spin_unlock(ptl); @@ -6204,7 +6227,8 @@ retry: } if (!device) putback_lru_page(page); -put: /* get_mctgt_type() gets the page */ +put: /* get_mctgt_type() gets & locks the page */ + unlock_page(page); put_page(page); break; case MC_TARGET_SWAP: -- cgit v1.2.3 From c7c3dec1c9db9746912af2bbb5d6a2dd9f152d20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Dec 2022 18:13:40 +0100 Subject: mm: rmap: remove lock_page_memcg() The previous patch made sure charge moving only touches pages for which page_mapped() is stable. lock_page_memcg() is no longer needed. Link: https://lkml.kernel.org/r/20221206171340.139790-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/rmap.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b616870a09be..32e48b1c5847 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1222,9 +1222,6 @@ void page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first = true; - if (unlikely(PageKsm(page))) - lock_page_memcg(page); - /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); @@ -1262,15 +1259,14 @@ void page_add_anon_rmap(struct page *page, if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - if (unlikely(PageKsm(page))) - unlock_page_memcg(page); - - /* address might be in next vma when migration races vma_adjust */ - else if (first) - __page_set_anon_rmap(page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - else - __page_check_anon_rmap(page, vma, address); + if (likely(!PageKsm(page))) { + /* address might be in next vma when migration races vma_adjust */ + if (first) + __page_set_anon_rmap(page, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + else + __page_check_anon_rmap(page, vma, address); + } mlock_vma_page(page, vma, compound); } @@ -1329,7 +1325,6 @@ void page_add_file_rmap(struct page *page, bool first; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); - lock_page_memcg(page); /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { @@ -1365,7 +1360,6 @@ void page_add_file_rmap(struct page *page, NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); - unlock_page_memcg(page); mlock_vma_page(page, vma, compound); } @@ -1394,8 +1388,6 @@ void page_remove_rmap(struct page *page, return; } - lock_page_memcg(page); - /* Is page being unmapped by PTE? Is this its last map to be removed? */ if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); @@ -1451,8 +1443,6 @@ void page_remove_rmap(struct page *page, * and remember that it's only reliable while mapped. */ - unlock_page_memcg(page); - munlock_vma_page(page, vma, compound); } -- cgit v1.2.3 From da34a8484d162585e22ed8c1e4114aa2f60e3567 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 7 Dec 2022 14:00:39 +0100 Subject: mm: memcontrol: deprecate charge moving Charge moving mode in cgroup1 allows memory to follow tasks as they migrate between cgroups. This is, and always has been, a questionable thing to do - for several reasons. First, it's expensive. Pages need to be identified, locked and isolated from various MM operations, and reassigned, one by one. Second, it's unreliable. Once pages are charged to a cgroup, there isn't always a clear owner task anymore. Cache isn't moved at all, for example. Mapped memory is moved - but if trylocking or isolating a page fails, it's arbitrarily left behind. Frequent moving between domains may leave a task's memory scattered all over the place. Third, it isn't really needed. Launcher tasks can kick off workload tasks directly in their target cgroup. Using dedicated per-workload groups allows fine-grained policy adjustments - no need to move tasks and their physical pages between control domains. The feature was never forward-ported to cgroup2, and it hasn't been missed. Despite it being a niche usecase, the maintenance overhead of supporting it is enormous. Because pages are moved while they are live and subject to various MM operations, the synchronization rules are complicated. There are lock_page_memcg() in MM and FS code, which non-cgroup people don't understand. In some cases we've been able to shift code and cgroup API calls around such that we can rely on native locking as much as possible. But that's fragile, and sometimes we need to hold MM locks for longer than we otherwise would (pte lock e.g.). Mark the feature deprecated. Hopefully we can remove it soon. And backport into -stable kernels so that people who develop against earlier kernels are warned about this deprecation as early as possible. [akpm@linux-foundation.org: fix memory.rst underlining] Link: https://lkml.kernel.org/r/Y5COd+qXwk/S+n8N@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 13 +++++++++++-- mm/memcontrol.c | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 60370f2c67b9..258e45cc3b2d 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -86,6 +86,8 @@ Brief summary of control files. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) memory.move_charge_at_immigrate set/show controls of moving charges + This knob is deprecated and shouldn't be + used. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node @@ -717,8 +719,15 @@ NOTE2: It is recommended to set the soft limit always below the hard limit, otherwise the hard limit will take precedence. -8. Move charges at task migration -================================= +8. Move charges at task migration (DEPRECATED!) +=============================================== + +THIS IS DEPRECATED! + +It's expensive and unreliable! It's better practice to launch workload +tasks directly from inside their target cgroup. Use dedicated workload +cgroups to allow fine-grained policy adjustments without having to +move physical pages between control domains. Users can move charges associated with a task along with task migration, that is, uncharge task's pages from the old cgroup and charge them to the new cgroup. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a698a2b6523b..49f67176a1a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3919,6 +3919,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + if (val & ~MOVE_MASK) return -EINVAL; -- cgit v1.2.3 From 98def236f63c66629fb6b2d4b69cecffc5b46539 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:20 +0000 Subject: mm/damon/core: implement damos filter Patch series "implement DAMOS filtering for anon pages and/or specific memory cgroups" DAMOS let users do system operations in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is somewhat accurate more than what user space could know in many cases. However, in some situation, users could know something more than the kernel about the pattern or some special requirements for some types of memory or processes. For example, some users would have slow swap devices and knows latency-ciritical processes and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes. For such restriction, users could exclude the memory regions from the initial monitoring regions and use non-dynamic monitoring regions update monitoring operations set including fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layout and access pattern, those would be not enough. To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions be applied to specific types of memory, to DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme. This patchset adds the support for all DAMOS actions that 'paddr' monitoring operations set supports ('pageout', 'lru_prio', and 'lru_deprio'), and the functionality is exposed via DAMON kernel API (damon.h) the DAMON sysfs interface (/sys/kernel/mm/damon/admins/), and DAMON_RECLAIM module parameters. Patches Sequence ---------------- First patch implements DAMOS filter interface to DAMON kernel API. Second patch makes the physical address space monitoring operations set to support the filters from all supporting DAMOS actions. Third patch adds anonymous pages filter support to DAMON_RECLAIM, and the fourth patch documents the DAMON_RECLAIM's new feature. Fifth to seventh patches implement DAMON sysfs files for support of the filters, and eighth patch connects the file to use DAMOS filters feature. Ninth patch adds simple self test cases for DAMOS filters of the sysfs interface. Finally, following two patches (tenth and eleventh) document the new features and interfaces. This patch (of 11): DAMOS lets users do system operation in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is somewhat accurate more than what user space could know in many cases. However, in some situation, users could know something more than the kernel about the pattern or some special requirements for some types of memory or processes. For example, some users would have slow swap devices and knows latency-ciritical processes and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes. For such restriction, users could exclude the memory regions from the initial monitoring regions and use non-dynamic monitoring regions update monitoring operations set including fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layout and access pattern, those would be not enough. To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions be applied to specific types of memory, to DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme. Note that this commit adds only the interface to the DAMON kernel API. The impelmentation should be made in the monitoring operations sets, and following commits will add that. Link: https://lkml.kernel.org/r/20221205230830.144349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index ad15a5b88e3a..7907918ad2e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,6 +8,7 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include #include @@ -215,6 +216,39 @@ struct damos_stat { unsigned long qt_exceeds; }; +/** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: If the matching page should filtered out or in. + * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches to this and avoid applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -239,6 +273,7 @@ struct damos_access_pattern { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -254,6 +289,10 @@ struct damos_access_pattern { * If all schemes that registered to a &struct damon_ctx are inactive, DAMON * stops monitoring and just repeatedly checks the watermarks. * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -263,6 +302,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; + struct list_head filters; struct damos_stat stat; struct list_head list; }; @@ -516,6 +556,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -536,6 +582,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks); diff --git a/mm/damon/core.c b/mm/damon/core.c index ceec75b88ef9..1bf0654ae189 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching) +{ + struct damos_filter *filter; + + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + return filter; +} + +void damos_add_filter(struct damos *s, struct damos_filter *f) +{ + list_add_tail(&f->list, &s->filters); +} + +static void damos_del_filter(struct damos_filter *f) +{ + list_del(&f->list); +} + +static void damos_free_filter(struct damos_filter *f) +{ + kfree(f); +} + +void damos_destroy_filter(struct damos_filter *f) +{ + damos_del_filter(f); + damos_free_filter(f); +} + /* initialize private fields of damos_quota and return the pointer */ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) { @@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return NULL; scheme->pattern = *pattern; scheme->action = action; + INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s) void damon_destroy_scheme(struct damos *s) { + struct damos_filter *f, *next; + + damos_for_each_filter_safe(f, next, s) + damos_destroy_filter(f); damon_del_scheme(s); damon_free_scheme(s); } -- cgit v1.2.3 From 18250e78f9c759010d7e0008af6e9c37aeddb1e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:21 +0000 Subject: mm/damon/paddr: support DAMOS filters Implement support of the DAMOS filters in the physical address space monitoring operations set, for all DAMOS actions that it supports including 'pageout', 'lru_prio', and 'lru_deprio'. Link: https://lkml.kernel.org/r/20221205230830.144349-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index e1a4315c4be6..ebd1905eed6f 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -202,7 +202,47 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_pageout(struct damon_region *r) +static bool __damos_pa_filter_out(struct damos_filter *filter, + struct page *page) +{ + bool matched = false; + struct mem_cgroup *memcg; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = PageAnon(page); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = page_memcg_check(page); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + default: + break; + } + + return matched == filter->matching; +} + +/* + * damos_pa_filter_out - Return true if the page should be filtered out. + */ +static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +{ + struct damos_filter *filter; + + damos_for_each_filter(filter, scheme) { + if (__damos_pa_filter_out(filter, page)) + return true; + } + return false; +} + +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; LIST_HEAD(page_list); @@ -213,6 +253,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r) if (!page) continue; + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + ClearPageReferenced(page); test_and_clear_page_young(page); if (isolate_lru_page(page)) { @@ -232,7 +277,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r) } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed) { unsigned long addr, applied = 0; @@ -241,6 +286,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (!page) continue; + + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + if (mark_accessed) mark_page_accessed(page); else @@ -251,14 +302,16 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true); } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_deactivate_pages(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, @@ -267,11 +320,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r); + return damon_pa_pageout(r, scheme); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r); + return damon_pa_mark_accessed(r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r); + return damon_pa_deactivate_pages(r, scheme); case DAMOS_STAT: break; default: -- cgit v1.2.3 From 66d9faec0745f8db4bd9ef59a287627fc5ea691f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:22 +0000 Subject: mm/damon/reclaim: add a parameter called skip_anon for avoiding anonymous pages reclamation In some cases, for example if users have confidence at anonymous pages management or the swap device is too slow, users would want to avoid DAMON_RECLAIM swapping the anonymous pages out. For such case, add yet another DAMON_RECLAIM parameter, namely 'skip_anon'. When it is set as 'Y', DAMON_RECLAIM will avoid reclaiming anonymous pages using a DAMOS filter. Link: https://lkml.kernel.org/r/20221205230830.144349-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e82631f39481..648d2a85523a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -98,6 +98,15 @@ module_param(monitor_region_start, ulong, 0600); static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); +/* + * Skip anonymous pages reclamation. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous + * pages. By default, ``N``. + */ +static bool skip_anon __read_mostly; +module_param(skip_anon, bool, 0600); + /* * PID of the DAMON thread * @@ -142,6 +151,7 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; + struct damos_filter *filter; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -152,6 +162,15 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (skip_anon) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + if (!filter) { + /* Will be freed by next 'damon_set_schemes()' below */ + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_filter(scheme, filter); + } damon_set_schemes(ctx, &scheme, 1); return damon_set_region_biggest_system_ram_default(target, -- cgit v1.2.3 From d56fe24237c3dec29b7de20ce93a4a53306d180b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:23 +0000 Subject: Docs/admin-guide/damon/reclaim: document 'skip_anon' parameter Document the newly added 'skip_anon' parameter of DAMON_RECLAIM, which can be used to avoid anonymous pages reclamation. Link: https://lkml.kernel.org/r/20221205230830.144349-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 4f1479a11e63..ff335e96e0d8 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -205,6 +205,15 @@ The end physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region and reclaims. By default, biggest System RAM is used as the region. +skip_anon +--------- + +Skip anonymous pages reclamation. + +If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous +pages. By default, ``N``. + + kdamond_pid ----------- -- cgit v1.2.3 From ac35264b9e8807f019f36e7dbc640b66fd43a865 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:24 +0000 Subject: mm/damon/sysfs-schemes: implement filters directory DAMOS filters are currently supported by only DAMON kernel API. To expose the feature to user space, implement a DAMON sysfs directory named 'filters' under each scheme directory. Please note that this is implementing only the directory. Following commits will implement more files and directories, and finally connect the DAMOS filters feature. Link: https://lkml.kernel.org/r/20221205230830.144349-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 81fc4d27f4e4..50c8148cb474 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,63 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filters directory + */ + +struct damon_sysfs_scheme_filters { + struct kobject kobj; + int nr; +}; + +static struct damon_sysfs_scheme_filters * +damon_sysfs_scheme_filters_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filters *filters = container_of(kobj, + struct damon_sysfs_scheme_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + return count; +} + +static void damon_sysfs_scheme_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + +static struct attribute *damon_sysfs_scheme_filters_attrs[] = { + &damon_sysfs_scheme_filters_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters); + +static struct kobj_type damon_sysfs_scheme_filters_ktype = { + .release = damon_sysfs_scheme_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filters_groups, +}; + /* * watermarks directory */ @@ -784,6 +841,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_scheme_filters *filters; struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; }; @@ -878,6 +936,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_filters *filters = + damon_sysfs_scheme_filters_alloc(); + int err; + + if (!filters) + return -ENOMEM; + err = kobject_init_and_add(&filters->kobj, + &damon_sysfs_scheme_filters_ktype, &scheme->kobj, + "filters"); + if (err) + kobject_put(&filters->kobj); + else + scheme->filters = filters; + return err; +} + static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) { struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); @@ -926,9 +1002,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); + err = damon_sysfs_scheme_set_filters(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_filters_watermarks_quotas_access_pattern_out; err = damon_sysfs_scheme_set_tried_regions(scheme); if (err) goto put_tried_regions_out; @@ -937,6 +1016,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) put_tried_regions_out: kobject_put(&scheme->tried_regions->kobj); scheme->tried_regions = NULL; +put_filters_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->filters->kobj); + scheme->filters = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -956,6 +1038,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); -- cgit v1.2.3 From 7ee161f18b5da5170b5d6a51aace49d312099128 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:25 +0000 Subject: mm/damon/sysfs-schemes: implement filter directory Implement DAMOS filter directory which will be located under the filters directory. The directory provides three files, namely type, matching, and memcg_path. 'type' and 'matching' will be directly connected to the fields of 'struct damos_filter' having same name. 'memcg_path' will receive the path of the memory cgroup of the interest and later converted to memcg id when it's committed. Link: https://lkml.kernel.org/r/20221205230830.144349-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50c8148cb474..afbfc55a8e84 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,134 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filter directory + */ + +struct damon_sysfs_scheme_filter { + struct kobject kobj; + enum damos_filter_type type; + bool matching; + char *memcg_path; +}; + +/* Should match with enum damos_filter_type */ +static const char * const damon_sysfs_scheme_filter_type_strs[] = { + "anon", + "memcg", +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_scheme_filter_type_strs[filter->type]); +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + enum damos_filter_type type; + ssize_t ret = -EINVAL; + + for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { + if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ + type])) { + filter->type = type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t memcg_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + filter->memcg_path ? filter->memcg_path : ""); +} + +static ssize_t memcg_path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strncpy(path, buf, count); + path[count] = '\0'; + filter->memcg_path = path; + return count; +} + +static void damon_sysfs_scheme_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + kfree(filter->memcg_path); + kfree(filter); +} + +static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = + __ATTR_RW_MODE(memcg_path, 0600); + +static struct attribute *damon_sysfs_scheme_filter_attrs[] = { + &damon_sysfs_scheme_filter_type_attr.attr, + &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_memcg_path_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); + +static struct kobj_type damon_sysfs_scheme_filter_ktype = { + .release = damon_sysfs_scheme_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filter_groups, +}; + /* * filters directory */ -- cgit v1.2.3 From 472e2b70eda6a1bccb62f391808924a59c49e22f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:26 +0000 Subject: mm/damon/sysfs-schemes: connect filter directory and filters directory Implement 'nr_filters' file under 'filters' directory, which will be used to populate specific number of 'filter' directory under the directory, similar to other 'nr_*' files in DAMON sysfs interface. Link: https://lkml.kernel.org/r/20221205230830.144349-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index afbfc55a8e84..e79c678a69d5 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -269,6 +269,11 @@ struct damon_sysfs_scheme_filter { char *memcg_path; }; +static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); +} + /* Should match with enum damos_filter_type */ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", @@ -392,6 +397,7 @@ static struct kobj_type damon_sysfs_scheme_filter_ktype = { struct damon_sysfs_scheme_filters { struct kobject kobj; + struct damon_sysfs_scheme_filter **filters_arr; int nr; }; @@ -401,6 +407,57 @@ damon_sysfs_scheme_filters_alloc(void) return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); } +static void damon_sysfs_scheme_filters_rm_dirs( + struct damon_sysfs_scheme_filters *filters) +{ + struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_scheme_filters_add_dirs( + struct damon_sysfs_scheme_filters *filters, int nr_filters) +{ + struct damon_sysfs_scheme_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_scheme_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_scheme_filter_alloc(); + if (!filter) { + damon_sysfs_scheme_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_scheme_filter_ktype, + &filters->kobj, "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_scheme_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + static ssize_t nr_filters_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -413,6 +470,7 @@ static ssize_t nr_filters_show(struct kobject *kobj, static ssize_t nr_filters_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct damon_sysfs_scheme_filters *filters; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -420,6 +478,15 @@ static ssize_t nr_filters_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_scheme_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; } @@ -1166,6 +1233,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->filters); kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); -- cgit v1.2.3 From 29cbb9a13f05b20f0777c60db9603730b487a4e0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:27 +0000 Subject: mm/damon/sysfs-schemes: implement scheme filters Implement scheme filters functionality of DAMON sysfs interface by making the code reads the values of files under the filter directories and pass that to DAMON using DAMON kernel API. [sj@kernel.org: fix leaking a filter for wrong cgroup path] Link: https://lkml.kernel.org/r/20221219171807.55708-2-sj@kernel.org [sj@kernel.org: return an error for filter memcg path id lookup failure] Link: https://lkml.kernel.org/r/20221219171807.55708-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index e79c678a69d5..f0dabe3e2dc0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1403,6 +1403,79 @@ struct kobj_type damon_sysfs_schemes_ktype = { .default_groups = damon_sysfs_schemes_groups, }; +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip removed memcg */ + if (!mem_cgroup_id(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + break; + } + } + + kfree(path); + return found ? 0 : -EINVAL; +} + +static int damon_sysfs_set_scheme_filters(struct damos *scheme, + struct damon_sysfs_scheme_filters *sysfs_filters) +{ + int i; + struct damos_filter *filter, *next; + + damos_for_each_filter_safe(filter, next, scheme) + damos_destroy_filter(filter); + + for (i = 0; i < sysfs_filters->nr; i++) { + struct damon_sysfs_scheme_filter *sysfs_filter = + sysfs_filters->filters_arr[i]; + struct damos_filter *filter = + damos_new_filter(sysfs_filter->type, + sysfs_filter->matching); + int err; + + if (!filter) + return -ENOMEM; + if (filter->type == DAMOS_FILTER_TYPE_MEMCG) { + err = damon_sysfs_memcg_path_to_id( + sysfs_filter->memcg_path, + &filter->memcg_id); + if (err) { + damos_destroy_filter(filter); + return err; + } + } + damos_add_filter(scheme, filter); + } + return 0; +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -1411,6 +1484,10 @@ static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + struct damon_sysfs_scheme_filters *sysfs_filters = + sysfs_scheme->filters; + struct damos *scheme; + int err; struct damos_access_pattern pattern = { .min_sz_region = access_pattern->sz->min, @@ -1436,8 +1513,17 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, "a, &wmarks); + if (!scheme) + return NULL; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + return scheme; } static void damon_sysfs_update_scheme(struct damos *scheme, @@ -1448,6 +1534,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + int err; scheme->pattern.min_sz_region = access_pattern->sz->min; scheme->pattern.max_sz_region = access_pattern->sz->max; @@ -1470,6 +1557,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->wmarks.high = sysfs_wmarks->high; scheme->wmarks.mid = sysfs_wmarks->mid; scheme->wmarks.low = sysfs_wmarks->low; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters); + if (err) + damon_destroy_scheme(scheme); } int damon_sysfs_set_schemes(struct damon_ctx *ctx, -- cgit v1.2.3 From 553b014244298d9f807286d6a71d722bc1f50f84 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:28 +0000 Subject: selftests/damon/sysfs: test filters directory Add simple test cases for scheme filters of DAMON sysfs interface. The test cases check if the files are populated as expected, receives some valid inputs, and refuses some invalid inputs. Link: https://lkml.kernel.org/r/20221205230830.144349-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index db4942383a50..a00336ffdcad 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -96,6 +96,34 @@ test_stats() done } +test_filter() +{ + filter_dir=$1 + ensure_file "$filter_dir/type" "exist" "600" + ensure_write_succ "$filter_dir/type" "anon" "valid input" + ensure_write_succ "$filter_dir/type" "memcg" "valid input" + ensure_write_fail "$filter_dir/type" "foo" "invalid input" + ensure_file "$filter_dir/matching" "exist" "600" + ensure_file "$filter_dir/memcg_path" "exist" "600" +} + +test_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_filter "$filters_dir/0" + test_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + test_watermarks() { watermarks_dir=$1 @@ -143,6 +171,7 @@ test_scheme() test_access_pattern "$scheme_dir/access_pattern" test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" + test_filters "$scheme_dir/filters" test_stats "$scheme_dir/stats" test_tried_regions "$scheme_dir/tried_regions" } -- cgit v1.2.3 From 9b7f9322a5300505fffba492b45621c897c0e0df Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:29 +0000 Subject: Docs/admin-guide/mm/damon/usage: document DAMOS filters of sysfs Document about the newly added files for DAMOS filters on the DAMON usage document. Link: https://lkml.kernel.org/r/20221205230830.144349-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 48 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1a5b6b71efa1..3d82ca6a17ff 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,6 +87,8 @@ comma (","). :: │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low + │ │ │ │ │ │ │ filters/nr_filters + │ │ │ │ │ │ │ │ 0/type,matching,memcg_id │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds │ │ │ │ │ │ │ tried_regions/ │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age @@ -151,6 +153,8 @@ number (``N``) to the file creates the number of child directories named as moment, only one context per kdamond is supported, so only ``0`` or ``1`` can be written to the file. +.. _sysfs_contexts: + contexts// ------------- @@ -268,8 +272,8 @@ schemes// ------------ In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``) -exist. +``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file +(``action``) exist. The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords @@ -347,6 +351,46 @@ as below. The ``interval`` should written in microseconds unit. +schemes//filters/ +-------------------- + +Users could know something more than the kernel for specific types of memory. +In the case, users could do their own management for the memory and hence +doesn't want DAMOS bothers that. Users could limit DAMOS by setting the access +pattern of the scheme and/or the monitoring regions for the purpose, but that +can be inefficient in some cases. In such cases, users could set non-access +pattern driven filters using files in this directory. + +In the beginning, this directory has only one file, ``nr_filters``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each filter. The filters are evaluated +in the numeric order. + +Each filter directory contains three files, namely ``type``, ``matcing``, and +``memcg_path``. You can write one of two special keywords, ``anon`` for +anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of +the memory cgroup filtering, you can specify the memory cgroup of the interest +by writing the path of the memory cgroup from the cgroups mount point to +``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to +filter out pages that does or does not match to the type, respectively. Then, +the scheme's action will not be applied to the pages that specified to be +filtered out. + +For example, below restricts a DAMOS action to be applied to only non-anonymous +pages of all memory cgroups except ``/having_care_already``.:: + + # echo 2 > nr_filters + # # filter out anonymous pages + echo anon > 0/type + echo Y > 0/matching + # # further filter out all cgroups except one at '/having_care_already' + echo memcg > 1/type + echo /having_care_already > 1/memcg_path + echo N > 1/matching + +Note that filters could be ignored depend on the running DAMON operations set +`implementation `. + .. _sysfs_schemes_stats: schemes//stats/ -- cgit v1.2.3 From 497b099d9a166f819e667f4bf258e7a00ea2e78a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:30 +0000 Subject: Docs/ABI/damon: document scheme filters files Document newly added DAMON sysfs interface files for DAMOS filtering on the DAMON ABI document. Link: https://lkml.kernel.org/r/20221205230830.144349-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 29 +++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 13397b853692..2744f21b5a6b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -258,6 +258,35 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the low watermark of the scheme in permil. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/nr_filters +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting filters of the scheme named '0' to + 'N-1' under the filters/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//type +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the type of + the memory of the interest. 'anon' for anonymous pages, or + 'memcg' for specific memory cgroup can be written and read. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path +Date: Dec 2022 +Contact: SeongJae Park +Description: If 'memcg' is written to the 'type' file, writing to and + reading from this file sets and gets the path to the memory + cgroup of the interest. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to filter out + pages that do or do not match to the 'type' and 'memcg_path', + respectively. Filter out means the action of the scheme will + not be applied to. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried Date: Mar 2022 Contact: SeongJae Park -- cgit v1.2.3 From 3f79b187ad2f677047a1f1104bfb8a81b58b67d5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:37 +0800 Subject: swapfile: get rid of volatile and avoid redundant read Patch series "Clean up and fixes for swap", v2. This series cleans up some code paths, saves a few cycles and reduces the object size by a bit. It also fixes some rare race issue with statistics. This patch (of 4): Convert a volatile variable to more readable READ_ONCE. And this actually avoids the code from reading the variable twice redundantly when it races. Link: https://lkml.kernel.org/r/20221219185840.25441-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20221219185840.25441-2-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..6d3f60bd383b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1835,13 +1835,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; struct swap_info_struct *si; int ret = 0; - volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; + unsigned char swp_count; if (!is_swap_pte(*pte)) continue; @@ -1852,7 +1852,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); - swap_map = &si->swap_map[offset]; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; @@ -1869,8 +1868,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, folio = page_folio(page); } if (!folio) { - if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) + swp_count = READ_ONCE(si->swap_map[offset]); + if (swp_count == 0 || swp_count == SWAP_MAP_BAD) goto try_next; + return -ENOMEM; } -- cgit v1.2.3 From 18ad72f5b7186336646182b17e0654bf907b39e6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:38 +0800 Subject: swap: avoid a redundant pte map if ra window is 1 Avoid a redundant pte map/unmap when swap readahead window is 1. Link: https://lkml.kernel.org/r/20221219185840.25441-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swap_state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 2927507b43d8..af8bc123b7c4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -727,8 +727,6 @@ static void swap_ra_info(struct vm_fault *vmf, } faddr = vmf->address; - orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); @@ -739,12 +737,11 @@ static void swap_ra_info(struct vm_fault *vmf, atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) { - pte_unmap(orig_pte); + if (win == 1) return; - } /* Copy the PTEs because the page table may be unmapped */ + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); if (fpfn == pfn + 1) swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); else if (pfn == fpfn + 1) -- cgit v1.2.3 From 16ba391e9c6bc26e6f0c950f2117f57b8e542d71 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:39 +0800 Subject: swap: fold swap_ra_clamp_pfn into swap_ra_info This makes the code cleaner. This helper is made of only two line of self explanational code and not reused anywhere else. And this actually make the compiled object smaller by a bit. bloat-o-meter results on x86_64 of mm/swap_state.o: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-35 (-35) Function old new delta swap_ra_info.constprop 512 477 -35 Total: Before=8388, After=8353, chg -0.42% Link: https://lkml.kernel.org/r/20221219185840.25441-4-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swap_state.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index af8bc123b7c4..d8d171195a3a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -693,28 +693,15 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } -static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, - unsigned long faddr, - unsigned long lpfn, - unsigned long rpfn, - unsigned long *start, - unsigned long *end) -{ - *start = max3(lpfn, PFN_DOWN(vma->vm_start), - PFN_DOWN(faddr & PMD_MASK)); - *end = min3(rpfn, PFN_DOWN(vma->vm_end), - PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); -} - static void swap_ra_info(struct vm_fault *vmf, - struct vma_swap_readahead *ra_info) + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - unsigned long faddr, pfn, fpfn; + unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, end; pte_t *pte, *orig_pte; - unsigned int max_win, hits, prev_win, win, left; + unsigned int max_win, hits, prev_win, win; #ifndef CONFIG_64BIT pte_t *tpte; #endif @@ -742,16 +729,23 @@ static void swap_ra_info(struct vm_fault *vmf, /* Copy the PTEs because the page table may be unmapped */ orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - if (fpfn == pfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); - else if (pfn == fpfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, - &start, &end); - else { - left = (win - 1) / 2; - swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, - &start, &end); + if (fpfn == pfn + 1) { + lpfn = fpfn; + rpfn = fpfn + win; + } else if (pfn == fpfn + 1) { + lpfn = fpfn - win + 1; + rpfn = fpfn + 1; + } else { + unsigned int left = (win - 1) / 2; + + lpfn = fpfn - left; + rpfn = fpfn + win - left; } + start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); + ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; pte -= ra_info->offset; -- cgit v1.2.3 From cbc2bd98db85504074c1b94175e5e136e457dc0b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:40 +0800 Subject: swap: avoid holding swap reference in swap_cache_get_folio All its callers either already hold a reference to, or lock the swap device while calling this function. There is only one exception in shmem_swapin_folio, just make this caller also hold a reference of the swap device, so this helper can be simplified and saves a few cycles. This also provides finer control of error handling in shmem_swapin_folio, on race (with swap off), it can just try again. For invalid swap entry, it can fail with a proper error code. Link: https://lkml.kernel.org/r/20221219185840.25441-5-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/shmem.c | 11 +++++++++++ mm/swap_state.c | 8 ++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d3f0c94f6836..bc5c156ef470 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1739,6 +1739,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; + struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1750,6 +1751,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (is_swapin_error_entry(swap)) return -EIO; + si = get_swap_device(swap); + if (!si) { + if (!shmem_confirm_swap(mapping, index, swap)) + return -EEXIST; + else + return -EINVAL; + } + /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { @@ -1810,6 +1819,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, delete_from_swap_cache(folio); folio_mark_dirty(folio); swap_free(swap); + put_swap_device(si); *foliop = folio; return 0; @@ -1823,6 +1833,7 @@ unlock: folio_unlock(folio); folio_put(folio); } + put_swap_device(si); return error; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d8d171195a3a..cb9aaa00951d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -321,19 +321,15 @@ static inline bool swap_use_vma_readahead(void) * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the folio * lock before returning. + * + * Caller must lock the swap device or hold a reference to keep it valid. */ struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio; - struct swap_info_struct *si; - si = get_swap_device(entry); - if (!si) - return NULL; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - put_swap_device(si); - if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; -- cgit v1.2.3 From 44383cef54c0ce1201f884d83cc2b367bc5aa4f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 19 Dec 2022 19:09:18 +0100 Subject: kasan: allow sampling page_alloc allocations for HW_TAGS As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. Add two new boot parameters that allow to alleviate that slowdown: - kasan.page_alloc.sample, which makes Hardware Tag-Based KASAN tag only every Nth page_alloc allocation with the order configured by the second added parameter (default: tag every such allocation). - kasan.page_alloc.sample.order, which makes sampling enabled by the first parameter only affect page_alloc allocations with the order equal or greater than the specified value (default: 3, see below). The exact performance improvement caused by using the new parameters depends on their values and the applied workload. The chosen default value for kasan.page_alloc.sample.order is 3, which matches both PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. This is done for two reasons: 1. PAGE_ALLOC_COSTLY_ORDER is "the order at which allocations are deemed costly to service", which corresponds to the idea that only large and thus costly allocations are supposed to sampled. 2. One of the workloads targeted by this patch is a benchmark that sends a large amount of data over a local loopback connection. Most multi-page data allocations in the networking subsystem have the order of SKB_FRAG_PAGE_ORDER (or PAGE_ALLOC_COSTLY_ORDER). When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Applying this patch and setting kasan.page_alloc.sampling to a value higher than 1 allows to lower the slowdown. The performance improvement saturates around the sampling interval value of 10 with the default sampling page order of 3. This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be better. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to a page_alloc allocation that has not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The rest ~93% of allocations are still checked deterministically. Link: https://lkml.kernel.org/r/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Jann Horn Cc: Mark Brand Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 17 +++++++++++ include/linux/kasan.h | 14 +++++---- mm/kasan/common.c | 9 ++++-- mm/kasan/hw_tags.c | 60 +++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 27 ++++++++++++++++++ mm/page_alloc.c | 43 +++++++++++++++++++--------- 6 files changed, 149 insertions(+), 21 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 5c93ab915049..e66916a483cd 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -140,6 +140,23 @@ disabling KASAN altogether or controlling its features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). +- ``kasan.page_alloc.sample=`` makes KASAN tag only every + Nth page_alloc allocation with the order equal or greater than + ``kasan.page_alloc.sample.order``, where N is the value of the ``sample`` + parameter (default: ``1``, or tag every such allocation). + This parameter is intended to mitigate the performance overhead introduced + by KASAN. + Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks + of allocations chosen by sampling and thus miss bad accesses to these + allocations. Use the default value for accurate bug detection. + +- ``kasan.page_alloc.sample.order=`` specifies the minimum + order of allocations that are affected by sampling (default: ``3``). + Only applies when ``kasan.page_alloc.sample`` is set to a value greater + than ``1``. + This parameter is intended to allow sampling only large page_alloc + allocations, which is the biggest source of the performance overhead. + Error reports ~~~~~~~~~~~~~ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 96c9d56e5510..5ebbaf672009 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -120,12 +120,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_cache_create_kmalloc(struct kmem_cache *cache); @@ -249,8 +250,11 @@ static __always_inline bool kasan_check_byte(const void *addr) static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a3..1d0008e1c420 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; if (unlikely(PageHighMem(page))) - return; + return false; + + if (!kasan_sample_page_alloc(order)) + return false; tag = kasan_random_tag(); kasan_unpoison(set_tag(page_address(page), tag), PAGE_SIZE << order, init); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); + + return true; } void __kasan_poison_pages(struct page *page, unsigned int order, bool init) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b22c4f461cb0..d1bcb0205327 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); +#define PAGE_ALLOC_SAMPLE_DEFAULT 1 +#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 + +/* + * Sampling interval of page_alloc allocation (un)poisoning. + * Defaults to no sampling. + */ +unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + +/* + * Minimum order of page_alloc allocations to be affected by sampling. + * The default value is chosen to match both + * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. + */ +unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + +DEFINE_PER_CPU(long, kasan_page_alloc_skip); + /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void) return "sync"; } +/* kasan.page_alloc.sample= */ +static int __init early_kasan_flag_page_alloc_sample(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtoul(arg, 0, &kasan_page_alloc_sample); + if (rv) + return rv; + + if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) { + kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); + +/* kasan.page_alloc.sample.order= */ +static int __init early_kasan_flag_page_alloc_sample_order(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order); + if (rv) + return rv; + + if (kasan_page_alloc_sample_order > INT_MAX) { + kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); + /* * kasan_init_hw_tags_cpu() is called for each CPU. * Not marked as __init as a CPU can be hot-plugged after boot. diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ea8cf1310b1e..32413f22aa82 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -42,6 +42,10 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +extern unsigned long kasan_page_alloc_sample; +extern unsigned int kasan_page_alloc_sample_order; +DECLARE_PER_CPU(long, kasan_page_alloc_skip); + static inline bool kasan_vmalloc_enabled(void) { return static_branch_likely(&kasan_flag_vmalloc); @@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + /* Fast-path for when sampling is disabled. */ + if (kasan_page_alloc_sample == 1) + return true; + + if (order < kasan_page_alloc_sample_order) + return true; + + if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) { + this_cpu_write(kasan_page_alloc_skip, + kasan_page_alloc_sample - 1); + return true; + } + + return false; +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_async_fault_possible(void) @@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void) return true; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + return true; +} + #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0745aedebb37..7d980dc0000e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1356,6 +1356,8 @@ out: * see the comment next to it. * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, * see the comment next to it. + * 4. The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages @@ -2468,7 +2470,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool reset_tags = !zero_tags; int i; set_page_private(page, 0); @@ -2491,30 +2494,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed (which happens only when memory - * should be initialized as well). + * If memory tags should be zeroed + * (which happens only when memory should be initialized as well). */ - if (init_tags) { + if (zero_tags) { /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); - /* Note that memory is already initialized by the loop above. */ + /* Take note that memory was initialized by the loop above. */ init = false; } if (!should_skip_kasan_unpoison(gfp_flags)) { - /* Unpoison shadow memory or set memory tags. */ - kasan_unpoison_pages(page, order, init); - - /* Note that memory is already initialized by KASAN. */ - if (kasan_has_integrated_init()) - init = false; - } else { - /* Ensure page_address() dereferencing does not fault. */ + /* Try unpoisoning (or setting tags) and initializing memory. */ + if (kasan_unpoison_pages(page, order, init)) { + /* Take note that memory was initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + /* Take note that memory tags were set by KASAN. */ + reset_tags = false; + } else { + /* + * KASAN decided to exclude this allocation from being + * poisoned due to sampling. Skip poisoning as well. + */ + SetPageSkipKASanPoison(page); + } + } + /* + * If memory tags have not been set, reset the page tags to ensure + * page_address() dereferencing does not fault. + */ + if (reset_tags) { for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); } - /* If memory is still not initialized, do it now. */ + /* If memory is still not initialized, initialize it now. */ if (init) kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ -- cgit v1.2.3 From ef1faf0e370a8e33fe625088ddc5fde02cf8c4c4 Mon Sep 17 00:00:00 2001 From: Jianlin Lv Date: Mon, 19 Dec 2022 16:49:17 +0000 Subject: tools/vm/page_owner_sort: free memory before exit Although when a process terminates, the kernel will removes memory associated with that process, It's neither good style nor proper design to leave it to kernel. This patch free allocated memory before process exit. Link: https://lkml.kernel.org/r/20221219164917.14132-1-iecedge@gmail.com Signed-off-by: Jianlin Lv Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 65 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index ce860ab94162..7c2ac124cdc8 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -246,15 +246,16 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) return 0; } -static void check_regcomp(regex_t *pattern, const char *regex) +static bool check_regcomp(regex_t *pattern, const char *regex) { int err; err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); if (err != 0 || pattern->re_nsub != 1) { fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); - exit(1); + return false; } + return true; } static char **explode(char sep, const char *str, int *size) @@ -494,28 +495,28 @@ static bool is_need(char *buf) return true; } -static void add_list(char *buf, int len, char *ext_buf) +static bool add_list(char *buf, int len, char *ext_buf) { if (list_size != 0 && len == list[list_size-1].len && memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; list[list_size-1].page_num += get_page_num(buf); - return; + return true; } if (list_size == max_size) { fprintf(stderr, "max_size too small??\n"); - exit(1); + return false; } if (!is_need(buf)) - return; + return true; list[list_size].pid = get_pid(buf); list[list_size].tgid = get_tgid(buf); list[list_size].comm = get_comm(buf); list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { fprintf(stderr, "Out of memory\n"); - exit(1); + return false; } memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; @@ -534,6 +535,7 @@ static void add_list(char *buf, int len, char *ext_buf) printf("loaded %d\r", list_size); fflush(stdout); } + return true; } static bool parse_cull_args(const char *arg_str) @@ -790,12 +792,19 @@ int main(int argc, char **argv) exit(1); } - check_regcomp(&order_pattern, "order\\s*([0-9]*),"); - check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); - check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "); - check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"); - check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); - check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); + if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) + goto out_order; + if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) + goto out_pid; + if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) + goto out_tgid; + if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) + goto out_comm; + if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) + goto out_ts; + if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) + goto out_free_ts; + fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... */ @@ -804,7 +813,7 @@ int main(int argc, char **argv) ext_buf = malloc(BUF_SIZE); if (!list || !buf || !ext_buf) { fprintf(stderr, "Out of memory\n"); - exit(1); + goto out_free; } for ( ; ; ) { @@ -812,7 +821,8 @@ int main(int argc, char **argv) if (buf_len < 0) break; - add_list(buf, buf_len, ext_buf); + if (!add_list(buf, buf_len, ext_buf)) + goto out_free; } printf("loaded %d\n", list_size); @@ -862,11 +872,26 @@ int main(int argc, char **argv) fprintf(fout, "\n"); } } - regfree(&order_pattern); - regfree(&pid_pattern); - regfree(&tgid_pattern); - regfree(&comm_pattern); - regfree(&ts_nsec_pattern); + +out_free: + if (ext_buf) + free(ext_buf); + if (buf) + free(buf); + if (list) + free(list); +out_free_ts: regfree(&free_ts_nsec_pattern); +out_ts: + regfree(&ts_nsec_pattern); +out_comm: + regfree(&comm_pattern); +out_tgid: + regfree(&tgid_pattern); +out_pid: + regfree(&pid_pattern); +out_order: + regfree(&order_pattern); + return 0; } -- cgit v1.2.3 From 80b1d8fdfad1f3084450afa6e2efcdcce867d4af Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 19 Dec 2022 12:36:59 +0000 Subject: mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node() This function sets __GFP_NOWARN in the gfp_mask rendering the warn_alloc() invocations no-ops. Remove this and instead rely on this flag being set only for the vm_area_alloc_pages() function, ensuring it is cleared for each of the warn_alloc() calls. Link: https://lkml.kernel.org/r/20221219123659.90614-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..10fe83c24436 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3031,7 +3031,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); - gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; -- cgit v1.2.3 From 831978e37e93bd3e36612917a4b193278950daff Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:52 +0800 Subject: maple_tree: remove extra space and blank line Patch series "Clean up and refinement for maple tree", v2. This patchset cleans up and refines some maple tree code. A few small changes make the code easier to understand and for better readability. This patch (of 7): These extra space and blank lines are unnecessary, so drop them. Link: https://lkml.kernel.org/r/20221221060058.609003-1-vernon2gm@gmail.com Link: https://lkml.kernel.org/r/20221221060058.609003-2-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 -- lib/maple_tree.c | 14 ++++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..4ee5a969441c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -517,7 +517,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. - * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +638,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 26e2045d3cda..975358bec754 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -183,7 +183,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } - static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; @@ -468,7 +467,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { - unsigned long val = (unsigned long) parent; + unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); @@ -502,7 +501,7 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { - unsigned long val = (unsigned long) mte_to_node(enode)->parent; + unsigned long val = (unsigned long)mte_to_node(enode)->parent; /* Root. */ if (val & 1) @@ -1278,7 +1277,6 @@ nomem_one: mas->alloc->total = success; mas_set_err(mas, -ENOMEM); return; - } /* @@ -2946,7 +2944,7 @@ next: mas->min = prev_min; mas->max = prev_max; mas->node = last; - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -3466,7 +3464,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { - struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; @@ -3892,7 +3889,7 @@ next: goto dead_node; } while (!ma_is_leaf(type)); - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -4710,7 +4707,6 @@ found: static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { - retry: mas_set(mas, index); mas_state_walk(mas); @@ -4718,7 +4714,6 @@ retry: goto retry; return; - } /* @@ -5620,7 +5615,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) mas_reset(wr_mas->mas); } } - } /* Interface */ -- cgit v1.2.3 From d56c593c8e128c42dc81707c07cbd5af41862214 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:53 +0800 Subject: maple_tree: remove extra return statement For functions with a return type of void, it is unnecessary to add a reurn statement at the end of the function, so drop it. Link: https://lkml.kernel.org/r/20221221060058.609003-3-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 975358bec754..fc70ae9850b1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1276,7 +1276,6 @@ nomem_one: if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) mas->alloc->total = success; mas_set_err(mas, -ENOMEM); - return; } /* @@ -4712,8 +4711,6 @@ retry: mas_state_walk(mas); if (mas_is_start(mas)) goto retry; - - return; } /* -- cgit v1.2.3 From bd592703b81a95473f6a01fe731beccd0992236e Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:54 +0800 Subject: maple_tree: use mt_node_max() instead of direct operations mt_max[] Use mt_node_max() to get the maximum number of slots for a node, rather than direct operations mt_max[], makes it better portability. Link: https://lkml.kernel.org/r/20221221060058.609003-4-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fc70ae9850b1..506b8fdb5f1b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6725,7 +6725,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; - else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) + else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; @@ -6832,7 +6832,7 @@ void mt_dump(const struct maple_tree *mt) if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0); else if (entry) - mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); + mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); } EXPORT_SYMBOL_GPL(mt_dump); -- cgit v1.2.3 From 84fd3e1ee395649ac45b7317d44c10b33d0dca79 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:55 +0800 Subject: maple_tree: use macro MA_ROOT_PARENT instead of number When you need to compare whether node->parent is parent of the root node, using macro MA_ROOT_PARENT is easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-5-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 506b8fdb5f1b..4b0575b60a11 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -503,8 +503,7 @@ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; - /* Root. */ - if (val & 1) + if (val & MA_ROOT_PARENT) return 0; /* -- cgit v1.2.3 From eabb305293835b191ffe60234587ae8bf5e4e9fd Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:56 +0800 Subject: maple_tree: remove the redundant code The macros CONFIG_DEBUG_MAPLE_TREE_VERBOSE no one uses, functions mas_dup_tree() and mas_dup_store() are not implemented, just function declaration, so drop it. Link: https://lkml.kernel.org/r/20221221060058.609003-6-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4ee5a969441c..815a27661517 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include #include /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, -- cgit v1.2.3 From 46b345848261009477552d654cb2f65000c30e4d Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:57 +0800 Subject: maple_tree: refine ma_state init from mas_start() If mas->node is an MAS_START, there are three cases, and they all assign different values to mas->node and mas->offset. So there is no need to set them to a default value before updating. Update them directly to make them easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-7-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4b0575b60a11..d4554c11ec15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1329,7 +1329,7 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max, depth, and offset to + * If mas->node == MAS_START, then set the min, max and depth to * defaults. * * Return: @@ -1343,22 +1343,22 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) if (likely(mas_is_start(mas))) { struct maple_enode *root; - mas->node = MAS_NONE; mas->min = 0; mas->max = ULONG_MAX; mas->depth = 0; - mas->offset = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); + mas->offset = 0; return NULL; } /* empty tree */ if (unlikely(!root)) { + mas->node = MAS_NONE; mas->offset = MAPLE_NODE_SLOTS; return NULL; } -- cgit v1.2.3 From e11cb683b2ebc6699bc0ca200442f1b80a51553f Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:58 +0800 Subject: maple_tree: refine mab_calc_split function Invert the conditional judgment of the mid_split, to focus the return statement in the last statement, which is easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-8-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d4554c11ec15..94f0053ec3e0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1882,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); - if (!(*mid_split)) - return split; - *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + if (unlikely(*mid_split)) + *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } -- cgit v1.2.3 From 318e9342fbbb6888d903d86e83865609901a1c65 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:45 -0800 Subject: mm/memory: add vm_normal_folio() Patch series "Convert deactivate_page() to folio_deactivate()", v4. Deactivate_page() has already been converted to use folios. This patch series modifies the callers of deactivate_page() to use folios. It also introduces vm_normal_folio() to assist with folio conversions, and converts deactivate_page() to folio_deactivate() which takes in a folio. This patch (of 4): Introduce a wrapper function called vm_normal_folio(). This function calls vm_normal_page() and returns the folio of the page found, or null if no page is found. This function allows callers to get a folio from a pte, which will eventually allow them to completely replace their struct page variables with struct folio instead. Link: https://lkml.kernel.org/r/20221221180848.20774-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221221180848.20774-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 253b2d7489e6..8e14183dfc58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1968,6 +1968,8 @@ static inline bool can_do_mlock(void) { return false; } extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index ca490596b36f..1598051a2a24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -625,6 +625,16 @@ out: return pfn_to_page(pfn); } +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + struct page *page = vm_normal_page(vma, addr, pte); + + if (page) + return page_folio(page); + return NULL; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) -- cgit v1.2.3 From 07e8c82b5eff8ef34b74210eacb8d9c4a2886b82 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:46 -0800 Subject: madvise: convert madvise_cold_or_pageout_pte_range() to use folios This change removes a number of calls to compound_head(), and saves 1729 bytes of kernel text. Link: https://lkml.kernel.org/r/20221221180848.20774-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/madvise.c | 98 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 479d9a32e44a..575ebf0363b8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -345,8 +345,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; spinlock_t *ptl; - struct page *page = NULL; - LIST_HEAD(page_list); + struct folio *folio = NULL; + LIST_HEAD(folio_list); bool pageout_anon_only_filter; if (fatal_signal_pending(current)) @@ -375,26 +375,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, goto huge_unlock; } - page = pmd_page(orig_pmd); + folio = pfn_folio(pmd_pfn(orig_pmd)); - /* Do not interfere with other mappings of this page */ - if (page_mapcount(page) != 1) + /* Do not interfere with other mappings of this folio */ + if (folio_mapcount(folio) != 1) goto huge_unlock; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; - get_page(page); + folio_get(folio); spin_unlock(ptl); - lock_page(page); - err = split_huge_page(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); if (!err) - goto regular_page; + goto regular_folio; return 0; } @@ -406,25 +406,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - ClearPageReferenced(page); - test_and_clear_page_young(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); if (pageout) { - if (!isolate_lru_page(page)) { - if (PageUnevictable(page)) - putback_lru_page(page); + if (!folio_isolate_lru(folio)) { + if (folio_test_unevictable(folio)) + folio_putback_lru(folio); else - list_add(&page->lru, &page_list); + list_add(&folio->lru, &folio_list); } } else - deactivate_page(page); + deactivate_page(&folio->page); huge_unlock: spin_unlock(ptl); if (pageout) - reclaim_pages(&page_list); + reclaim_pages(&folio_list); return 0; } -regular_page: +regular_folio: if (pmd_trans_unstable(pmd)) return 0; #endif @@ -441,33 +441,33 @@ regular_page: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; /* * Creating a THP page is expensive so split it only if we * are sure it's worth. Split it if we are only owner. */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) break; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) break; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); break; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); break; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; @@ -475,16 +475,16 @@ regular_page: } /* - * Do not interfere with other mappings of this page and - * non-LRU page. + * Do not interfere with other mappings of this folio and + * non-LRU folio. */ - if (!PageLRU(page) || page_mapcount(page) != 1) + if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) continue; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); if (pte_young(ptent)) { ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -495,28 +495,28 @@ regular_page: } /* - * We are deactivating a page for accelerating reclaiming. - * VM couldn't reclaim the page unless we clear PG_young. + * We are deactivating a folio for accelerating reclaiming. + * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ - ClearPageReferenced(page); - test_and_clear_page_young(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); if (pageout) { - if (!isolate_lru_page(page)) { - if (PageUnevictable(page)) - putback_lru_page(page); + if (!folio_isolate_lru(folio)) { + if (folio_test_unevictable(folio)) + folio_putback_lru(folio); else - list_add(&page->lru, &page_list); + list_add(&folio->lru, &folio_list); } } else - deactivate_page(page); + deactivate_page(&folio->page); } arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); if (pageout) - reclaim_pages(&page_list); + reclaim_pages(&folio_list); cond_resched(); return 0; -- cgit v1.2.3 From f70da5ee8fe15b21501613ccab27eb2f722a3394 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:47 -0800 Subject: mm/damon: convert damon_pa_mark_accessed_or_deactivate() to use folios This change replaces 2 calls to compound_head() from put_page() and 1 call from mark_page_accessed() with one from page_folio(). This is in preparation for the conversion of deactivate_page() to folio_deactivate(). Link: https://lkml.kernel.org/r/20221221180848.20774-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: SeongJae Park Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index ebd1905eed6f..884c8bf18b12 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -283,21 +283,23 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); + struct folio *folio; if (!page) continue; + folio = page_folio(page); - if (damos_pa_filter_out(s, page)) { - put_page(page); + if (damos_pa_filter_out(s, &folio->page)) { + folio_put(folio); continue; } if (mark_accessed) - mark_page_accessed(page); + folio_mark_accessed(folio); else - deactivate_page(page); - put_page(page); - applied++; + deactivate_page(&folio->page); + folio_put(folio); + applied += folio_nr_pages(folio); } return applied * PAGE_SIZE; } -- cgit v1.2.3 From 5a9e34747c9f731bbb6b7fd7521c4fec0d840593 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:48 -0800 Subject: mm/swap: convert deactivate_page() to folio_deactivate() Deactivate_page() has already been converted to use folios, this change converts it to take in a folio argument instead of calling page_folio(). It also renames the function folio_deactivate() to be more consistent with other folio functions. [akpm@linux-foundation.org: fix left-over comments, per Yu Zhao] Link: https://lkml.kernel.org/r/20221221180848.20774-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- mm/damon/paddr.c | 2 +- mm/madvise.c | 4 ++-- mm/page-writeback.c | 4 ++-- mm/swap.c | 14 ++++++-------- mm/vmscan.c | 2 +- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 93f1cebd8545..87cecb8c0bdc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -401,7 +401,7 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_page(struct page *page); +void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 884c8bf18b12..6334c99e5152 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -297,7 +297,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (mark_accessed) folio_mark_accessed(folio); else - deactivate_page(&folio->page); + folio_deactivate(folio); folio_put(folio); applied += folio_nr_pages(folio); } diff --git a/mm/madvise.c b/mm/madvise.c index 575ebf0363b8..e407d335e614 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -416,7 +416,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, list_add(&folio->lru, &folio_list); } } else - deactivate_page(&folio->page); + folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) @@ -510,7 +510,7 @@ regular_folio: list_add(&folio->lru, &folio_list); } } else - deactivate_page(&folio->page); + folio_deactivate(folio); } arch_leave_lazy_mmu_mode(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ad608ef2a243..41128ea9c997 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2846,11 +2846,11 @@ bool folio_mark_dirty(struct folio *folio) if (likely(mapping)) { /* - * readahead/lru_deactivate_page could remain + * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the folio is redirtied, + * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's diff --git a/mm/swap.c b/mm/swap.c index 5e5eba186930..e54e2a252e27 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -733,17 +733,15 @@ void deactivate_file_folio(struct folio *folio) } /* - * deactivate_page - deactivate a page - * @page: page to deactivate + * folio_deactivate - deactivate a folio + * @folio: folio to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * folio_deactivate() moves @folio to the inactive list if @folio was on the + * active list and was not unevictable. This is done to accelerate the + * reclaim of @folio. */ -void deactivate_page(struct page *page) +void folio_deactivate(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_lru(folio) && !folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..aa8c252949da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1920,7 +1920,7 @@ retry: !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. - * Similar in principle to deactivate_page() + * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ -- cgit v1.2.3 From 0b7b8704ddcee372099a2bc6781db6ab273a85d5 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Wed, 21 Dec 2022 22:42:45 +0800 Subject: mm: new primitive kvmemdup() Similar to kmemdup(), but support large amount of bytes with kvmalloc() and does *not* guarantee that the result will be physically contiguous. Use only in cases where kvmalloc() is needed and free it with kvfree(). Also adapt policy_unpack.c in case someone bisect into this. Link: https://lkml.kernel.org/r/20221221144245.27164-1-sunhao.th@gmail.com Signed-off-by: Hao Sun Suggested-by: Daniel Borkmann Cc: Nick Terrell Cc: John Johansen Cc: Paul Moore Cc: James Morris Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/string.h | 1 + mm/util.c | 24 +++++++++++++++++++++++- security/apparmor/policy_unpack.c | 11 +---------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index db28802ab0a6..c062c581a98b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/mm/util.c b/mm/util.c index b56c92fb910f..cec9327b27b4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -120,7 +120,8 @@ EXPORT_SYMBOL(kstrndup); * @len: memory region length * @gfp: GFP mask to use * - * Return: newly allocated copy of @src or %NULL in case of error + * Return: newly allocated copy of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -133,6 +134,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * kvmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error, + * result may be not physically contiguous. Use kvfree() to free. + */ +void *kvmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = kvmalloc(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kvmemdup); + /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 66915653108c..5e9949832af6 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -161,15 +161,6 @@ VISIBLE_IF_KUNIT bool aa_inbounds(struct aa_ext *e, size_t size) } EXPORT_SYMBOL_IF_KUNIT(aa_inbounds); -static void *kvmemdup(const void *src, size_t len) -{ - void *p = kvmalloc(len, GFP_KERNEL); - - if (p) - memcpy(p, src, len); - return p; -} - /** * aa_unpack_u16_chunk - test and do bounds checking for a u16 size based chunk * @e: serialized data read head (NOT NULL) @@ -1027,7 +1018,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) data->key = key; data->size = aa_unpack_blob(e, &data->data, NULL); - data->data = kvmemdup(data->data, data->size); + data->data = kvmemdup(data->data, data->size, GFP_KERNEL); if (data->size && !data->data) { kfree_sensitive(data->key); kfree_sensitive(data); -- cgit v1.2.3 From b5054174ac7c7d8fae15deee7ddc0e20fd604f30 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Dec 2022 21:24:54 +0000 Subject: mm: move FOLL_* defs to mm_types.h Move FOLL_* definitions to linux/mm_types.h to make them more accessible without having to drag in all of linux/mm.h and everything that drags in too[1]. Link: https://lkml.kernel.org/r/2161258.1671657894@warthog.procyon.org.uk Signed-off-by: David Howells Suggested-by: Matthew Wilcox Reviewed-by: John Hubbard Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton --- include/linux/mm.h | 75 ------------------------------------------------ include/linux/mm_types.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e14183dfc58..d68579bf8484 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,81 +3057,6 @@ static inline vm_fault_t vmf_error(int err) struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem. Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed. Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region. And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. - */ - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9757067c3053..1118e381fcdc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1085,4 +1085,79 @@ enum fault_flag { typedef unsigned int __bitwise zap_flags_t; +/* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY. + * + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when + * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to unpin_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via unpin_user_page(). + * + * Please see Documentation/core-api/pin_user_pages.rst for more information. + */ + +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ +#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO + * and return without waiting upon it */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ +#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ +#define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ +#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From edd898181e2f6f0969c08e1dfe2b7cdf902b9b33 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:20 +0100 Subject: mm: vmalloc: avoid calling __find_vmap_area() twice in __vunmap() Currently the __vunmap() path calls __find_vmap_area() twice. Once on entry to check that the area exists, then inside the remove_vm_area() function which also performs a new search for the VA. In order to improvie it from a performance point of view we split remove_vm_area() into two new parts: - find_unlink_vmap_area() that does a search and unlink from tree; - __remove_vm_area() that removes without searching. In this case there is no any functional change for remove_vm_area() whereas vm_remove_mappings(), where a second search happens, switches to the __remove_vm_area() variant where the already detached VA is passed as a parameter, so there is no need to find it again. Performance wise, i use test_vmalloc.sh with 32 threads doing alloc free on a 64-CPUs-x86_64-box: perf without this patch: - 31.41% 0.50% vmalloc_test/10 [kernel.vmlinux] [k] __vunmap - 30.92% __vunmap - 17.67% _raw_spin_lock native_queued_spin_lock_slowpath - 12.33% remove_vm_area - 11.79% free_vmap_area_noflush - 11.18% _raw_spin_lock native_queued_spin_lock_slowpath 0.76% free_unref_page perf with this patch: - 11.35% 0.13% vmalloc_test/14 [kernel.vmlinux] [k] __vunmap - 11.23% __vunmap - 8.28% find_unlink_vmap_area - 7.95% _raw_spin_lock 7.44% native_queued_spin_lock_slowpath - 1.93% free_vmap_area_noflush - 0.56% _raw_spin_lock 0.53% native_queued_spin_lock_slowpath 0.60% __vunmap_range_noflush __vunmap() consumes around ~20% less CPU cycles on this test. Also, switch from find_vmap_area() to find_unlink_vmap_area() to prevent a double access to the vmap_area_lock: one for finding area, second time is for unlinking from a tree. [urezki@gmail.com: switch to find_unlink_vmap_area() in vm_unmap_ram()] Link: https://lkml.kernel.org/r/20221222190022.134380-2-urezki@gmail.com Link: https://lkml.kernel.org/r/20221222190022.134380-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reported-by: Roman Gushchin Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 79 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 10fe83c24436..476ccbffb208 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1815,9 +1815,9 @@ static void drain_vmap_area_work(struct work_struct *work) } /* - * Free a vmap area, caller ensuring that the area has been unmapped - * and flush_cache_vunmap had been called for the correct range - * previously. + * Free a vmap area, caller ensuring that the area has been unmapped, + * unlinked and flush_cache_vunmap had been called for the correct + * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { @@ -1825,9 +1825,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) unsigned long va_start = va->va_start; unsigned long nr_lazy; - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + if (WARN_ON_ONCE(!list_empty(&va->list))) + return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -1871,6 +1870,19 @@ struct vmap_area *find_vmap_area(unsigned long addr) return va; } +static struct vmap_area *find_unlink_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr, &vmap_area_root); + if (va) + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + return va; +} + /*** Per cpu kva allocator ***/ /* @@ -2015,6 +2027,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); + spin_lock(&vmap_area_lock); + unlink_va(vb->va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } @@ -2236,7 +2252,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) return; } - va = find_vmap_area(addr); + va = find_unlink_vmap_area(addr); BUG_ON(!va); debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); @@ -2591,6 +2607,20 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } +static struct vm_struct *__remove_vm_area(struct vmap_area *va) +{ + struct vm_struct *vm; + + if (!va || !va->vm) + return NULL; + + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + + return vm; +} + /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2603,26 +2633,10 @@ struct vm_struct *find_vm_area(const void *addr) */ struct vm_struct *remove_vm_area(const void *addr) { - struct vmap_area *va; - might_sleep(); - spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr, &vmap_area_root); - if (va && va->vm) { - struct vm_struct *vm = va->vm; - - va->vm = NULL; - spin_unlock(&vmap_area_lock); - - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; - } - - spin_unlock(&vmap_area_lock); - return NULL; + return __remove_vm_area( + find_unlink_vmap_area((unsigned long) addr)); } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2636,16 +2650,17 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ +static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) { + struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - remove_vm_area(area->addr); + __remove_vm_area(va); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ if (!flush_reset) @@ -2690,6 +2705,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + struct vmap_area *va; if (!addr) return; @@ -2698,19 +2714,20 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vm_area(addr); - if (unlikely(!area)) { + va = find_unlink_vmap_area((unsigned long)addr); + if (unlikely(!va)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } + area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); + va_remove_mappings(va, deallocate_pages); if (deallocate_pages) { int i; -- cgit v1.2.3 From 14687619e1122d71b2ed70e1afa6bc352e629e85 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:22 +0100 Subject: mm: vmalloc: replace BUG_ON() by WARN_ON_ONCE() Currently a vm_unmap_ram() functions triggers a BUG() if an area is not found. Replace it by the WARN_ON_ONCE() error message and keep machine alive instead of stopping it. The worst case is a memory leaking. Link: https://lkml.kernel.org/r/20221222190022.134380-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Reviewed-by: Christoph Hellwig Cc: Baoquan He Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 476ccbffb208..428e0bee5c9c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2253,7 +2253,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) } va = find_unlink_vmap_area(addr); - BUG_ON(!va); + if (WARN_ON_ONCE(!va)) + return; + debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); -- cgit v1.2.3 From 391655fe08d1f942359a11148aa9aaf3f99d6d6f Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:18:59 -0700 Subject: mm: multi-gen LRU: rename lru_gen_struct to lru_gen_folio Patch series "mm: multi-gen LRU: memcg LRU", v3. Overview ======== An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, since each node and memcg combination has an LRU of folios (see mem_cgroup_lruvec()). Its goal is to improve the scalability of global reclaim, which is critical to system-wide memory overcommit in data centers. Note that memcg reclaim is currently out of scope. Its memory bloat is a pointer to each lruvec and negligible to each pglist_data. In terms of traversing memcgs during global reclaim, it improves the best-case complexity from O(n) to O(1) and does not affect the worst-case complexity O(n). Therefore, on average, it has a sublinear complexity in contrast to the current linear complexity. The basic structure of an memcg LRU can be understood by an analogy to the active/inactive LRU (of folios): 1. It has the young and the old (generations), i.e., the counterparts to the active and the inactive; 2. The increment of max_seq triggers promotion, i.e., the counterpart to activation; 3. Other events trigger similar operations, e.g., offlining an memcg triggers demotion, i.e., the counterpart to deactivation. In terms of global reclaim, it has two distinct features: 1. Sharding, which allows each thread to start at a random memcg (in the old generation) and improves parallelism; 2. Eventual fairness, which allows direct reclaim to bail out at will and reduces latency without affecting fairness over some time. The commit message in patch 6 details the workflow: https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/ The following is a simple test to quickly verify its effectiveness. Test design: 1. Create multiple memcgs. 2. Each memcg contains a job (fio). 3. All jobs access the same amount of memory randomly. 4. The system does not experience global memory pressure. 5. Periodically write to the root memory.reclaim. Desired outcome: 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal) over mean(pgsteal) is close to 0%. 2. The total pgsteal is close to the total requested through memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close to 100%. Actual outcome [1]: MGLRU off MGLRU on stddev(pgsteal) / mean(pgsteal) 75% 20% sum(pgsteal) / sum(requested) 425% 95% #################################################################### MEMCGS=128 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do mkdir /sys/fs/cgroup/memcg$memcg done start() { echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \ --filename=/dev/zero --size=1920M --rw=randrw \ --rate=64m,64m --random_distribution=random \ --fadvise_hint=0 --time_based --runtime=10h \ --group_reporting --minimal } for ((memcg = 0; memcg < $MEMCGS; memcg++)); do start & done sleep 600 for ((i = 0; i < 600; i++)); do echo 256m >/sys/fs/cgroup/memory.reclaim sleep 6 done for ((memcg = 0; memcg < $MEMCGS; memcg++)); do grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat done #################################################################### [1]: This was obtained from running the above script (touches less than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an hour. This patch (of 8): The new name lru_gen_folio will be more distinct from the coming lru_gen_memcg. Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 6 +++--- mm/vmscan.c | 34 +++++++++++++++++----------------- mm/workingset.c | 4 ++-- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ff3f3f23f649..177b8b1dd43c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..1686fcc4ed01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -404,7 +404,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ -struct lru_gen_struct { +struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ @@ -461,7 +461,7 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_struct */ + /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; @@ -524,7 +524,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index aa8c252949da..5505f54871c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3215,7 +3215,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_struct */ + /* see the comment on lru_gen_folio */ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; @@ -3612,7 +3612,7 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->refaulted = lrugen->avg_refaulted[type][tier] + @@ -3627,7 +3627,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; @@ -3704,7 +3704,7 @@ static int folio_update_gen(struct folio *folio, int gen) static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags); @@ -3749,7 +3749,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) { int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -4263,7 +4263,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) { int zone; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); if (type == LRU_GEN_ANON && !can_swap) @@ -4299,7 +4299,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) { int gen, type, zone; bool success = false; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4320,7 +4320,7 @@ next: ; } - /* see the comment on lru_gen_struct */ + /* see the comment on lru_gen_folio */ if (can_swap) { min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); @@ -4342,7 +4342,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) { int prev, next; int type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; spin_lock_irq(&lruvec->lru_lock); @@ -4400,7 +4400,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); @@ -4465,7 +4465,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); for (type = !can_swap; type < ANON_AND_FILE; type++) { @@ -4750,7 +4750,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4850,7 +4850,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -5251,7 +5251,7 @@ done: static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; @@ -5530,7 +5530,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int i; int type, tier; int hist = lru_hist_from_seq(seq); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5580,7 +5580,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -5834,7 +5834,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); diff --git a/mm/workingset.c b/mm/workingset.c index 1a86645b7b3c..fd666584515c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); @@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct pglist_data *pgdat; int type = folio_is_file_lru(folio); -- cgit v1.2.3 From 6df1b2212950aae2b2188c6645ea18e2a9e3fdd5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:00 -0700 Subject: mm: multi-gen LRU: rename lrugen->lists[] to lrugen->folios[] lru_gen_folio will be chained into per-node lists by the coming lrugen->list. Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 8 ++++---- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 8 ++++---- mm/vmscan.c | 20 ++++++++++---------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d7062c6a8946..d8f721f98868 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -89,15 +89,15 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. Each -truncated generation number is an index to ``lrugen->lists[]``. The +truncated generation number is an index to ``lrugen->folios[]``. The sliding window technique is used to track at least ``MIN_NR_GENS`` and at most ``MAX_NR_GENS`` generations. The gen counter stores a value within ``[1, MAX_NR_GENS]`` while a page is on one of -``lrugen->lists[]``; otherwise it stores zero. +``lrugen->folios[]``; otherwise it stores zero. Each generation is divided into multiple tiers. A page accessed ``N`` times through file descriptors is in tier ``order_base_2(N)``. Unlike -generations, tiers do not have dedicated ``lrugen->lists[]``. In +generations, tiers do not have dedicated ``lrugen->folios[]``. In contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop @@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it -increments ``min_seq`` when ``lrugen->lists[]`` indexed by +increments ``min_seq`` when ``lrugen->folios[]`` indexed by ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 177b8b1dd43c..eb8a2435ee80 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1686fcc4ed01..6c96ee823dbd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -312,7 +312,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +324,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * @@ -412,7 +412,7 @@ struct lru_gen_folio { /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5505f54871c9..d8a53b7443d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4271,7 +4271,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) /* prevent cold/hot inversion if force_scan is true */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct list_head *head = &lrugen->lists[old_gen][type][zone]; + struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -4282,7 +4282,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); if (!--remaining) return false; @@ -4310,7 +4310,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } @@ -4775,7 +4775,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4784,7 +4784,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int hist = lru_hist_from_seq(lrugen->min_seq[type]); gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); @@ -4796,7 +4796,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { gen = folio_inc_gen(lruvec, folio, true); - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4863,7 +4863,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, for (zone = sc->reclaim_idx; zone >= 0; zone--) { LIST_HEAD(moved); int skipped = 0; - struct list_head *head = &lrugen->lists[gen][type][zone]; + struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -5264,7 +5264,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } @@ -5309,7 +5309,7 @@ static bool drain_evictable(struct lruvec *lruvec) int remaining = MAX_LRU_BATCH; for_each_gen_type_zone(gen, type, zone) { - struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; @@ -5843,7 +5843,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); lruvec->mm_state.seq = MIN_NR_GENS; init_waitqueue_head(&lruvec->mm_state.wait); -- cgit v1.2.3 From a579086c99ed70cc4bfc104348dbe3dd8f2787e6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:01 -0700 Subject: mm: multi-gen LRU: remove eviction fairness safeguard Recall that the eviction consumes the oldest generation: first it bucket-sorts folios whose gen counters were updated by the aging and reclaims the rest; then it increments lrugen->min_seq. The current eviction fairness safeguard for global reclaim has a dilemma: when there are multiple eligible memcgs, should it continue or stop upon meeting the reclaim goal? If it continues, it overshoots and increases direct reclaim latency; if it stops, it loses fairness between memcgs it has taken memory away from and those it has yet to. With memcg LRU, the eviction, while ensuring eventual fairness, will stop upon meeting its goal. Therefore the current eviction fairness safeguard for global reclaim will not be needed. Note that memcg LRU only applies to global reclaim. For memcg reclaim, the eviction will continue, even if it is overshooting. This becomes unconditional due to code simplification. Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 81 ++++++++++++++++++------------------------------------------- 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d8a53b7443d4..bfbfc98c856c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -449,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return sc->target_mem_cgroup; } +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); +} + /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question @@ -499,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return false; } +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + static bool writeback_throttling_sane(struct scan_control *sc) { return true; @@ -5006,8 +5016,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return scanned; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, - bool *need_swapping) +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; @@ -5096,9 +5105,6 @@ retry: goto retry; } - if (need_swapping && type == LRU_GEN_ANON) - *need_swapping = true; - return scanned; } @@ -5138,67 +5144,26 @@ done: return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; } -static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, - struct scan_control *sc, bool need_swapping) +static unsigned long get_nr_to_reclaim(struct scan_control *sc) { - int i; - DEFINE_MAX_SEQ(lruvec); - - if (!current_is_kswapd()) { - /* age each memcg at most once to ensure fairness */ - if (max_seq - seq > 1) - return true; - - /* over-swapping can increase allocation latency */ - if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) - return true; - - /* give this thread a chance to exit and free its memory */ - if (fatal_signal_pending(current)) { - sc->nr_reclaimed += MIN_LRU_BATCH; - return true; - } - - if (cgroup_reclaim(sc)) - return false; - } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) - return false; - - /* keep scanning at low priorities to ensure fairness */ - if (sc->priority > DEF_PRIORITY - 2) - return false; - - /* - * A minimum amount of work was done under global memory pressure. For - * kswapd, it may be overshooting. For direct reclaim, the allocation - * may succeed if all suitable zones are somewhat safe. In either case, - * it's better to stop now, and restart later if necessary. - */ - for (i = 0; i <= sc->reclaim_idx; i++) { - unsigned long wmark; - struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; - - if (!managed_zone(zone)) - continue; - - wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); - if (wmark > zone_page_state(zone, NR_FREE_PAGES)) - return false; - } + /* don't abort memcg reclaim to ensure fairness */ + if (!global_reclaim(sc)) + return -1; - sc->nr_reclaimed += MIN_LRU_BATCH; + /* discount the previous progress for kswapd */ + if (current_is_kswapd()) + return sc->nr_to_reclaim + sc->last_reclaimed; - return true; + return max(sc->nr_to_reclaim, compact_gap(sc->order)); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; bool need_aging = false; - bool need_swapping = false; unsigned long scanned = 0; unsigned long reclaimed = sc->nr_reclaimed; - DEFINE_MAX_SEQ(lruvec); + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); lru_add_drain(); @@ -5222,7 +5187,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc if (!nr_to_scan) goto done; - delta = evict_folios(lruvec, sc, swappiness, &need_swapping); + delta = evict_folios(lruvec, sc, swappiness); if (!delta) goto done; @@ -5230,7 +5195,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc if (scanned >= nr_to_scan) break; - if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + if (sc->nr_reclaimed >= nr_to_reclaim) break; cond_resched(); @@ -5677,7 +5642,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness, NULL)) + if (!evict_folios(lruvec, sc, swappiness)) return 0; cond_resched(); -- cgit v1.2.3 From 7348cc91821b0cb24dfb00e578047f68299a50ab Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:02 -0700 Subject: mm: multi-gen LRU: remove aging fairness safeguard Recall that the aging produces the youngest generation: first it scans for accessed folios and updates their gen counters; then it increments lrugen->max_seq. The current aging fairness safeguard for kswapd uses two passes to ensure the fairness to multiple eligible memcgs. On the first pass, which is shared with the eviction, it checks whether all eligible memcgs are low on cold folios. If so, it requires a second pass, on which it ages all those memcgs at the same time. With memcg LRU, the aging, while ensuring eventual fairness, will run when necessary. Therefore the current aging fairness safeguard for kswapd will not be needed. Note that memcg LRU only applies to global reclaim. For memcg reclaim, the aging can be unfair to different memcgs, i.e., their lrugen->max_seq can be incremented at different paces. Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 126 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bfbfc98c856c..cc522e048ed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -137,7 +137,6 @@ struct scan_control { #ifdef CONFIG_LRU_GEN /* help kswapd make better choices among multiple memcgs */ - unsigned int memcgs_need_aging:1; unsigned long last_reclaimed; #endif @@ -4468,7 +4467,7 @@ done: return true; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; @@ -4477,6 +4476,13 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + /* whether this lruvec is completely out of cold folios */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { + *nr_to_scan = 0; + return true; + } for (type = !can_swap; type < ANON_AND_FILE; type++) { unsigned long seq; @@ -4505,8 +4511,6 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig * stalls when the number of generations reaches MIN_NR_GENS. Hence, the * ideal number of generations is MIN_NR_GENS+1. */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) - return true; if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) return false; @@ -4525,40 +4529,54 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig return false; } -static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { - bool need_aging; - unsigned long nr_to_scan; - int swappiness = get_swappiness(lruvec, sc); + int gen, type, zone; + unsigned long total = 0; + bool can_swap = get_swappiness(lruvec, sc); + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; - mem_cgroup_calculate_protection(NULL, memcg); + for (seq = min_seq[type]; seq <= max_seq; seq++) { + gen = lru_gen_from_seq(seq); - if (mem_cgroup_below_min(NULL, memcg)) - return false; + for (zone = 0; zone < MAX_NR_ZONES; zone++) + total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } - need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + /* whether the size is big enough to be helpful */ + return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; +} - if (min_ttl) { - int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); - unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, + unsigned long min_ttl) +{ + int gen; + unsigned long birth; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); - if (time_is_after_jiffies(birth + min_ttl)) - return false; + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); - /* the size is likely too small to be helpful */ - if (!nr_to_scan && sc->priority != DEF_PRIORITY) - return false; - } + /* see the comment on lru_gen_folio */ + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - if (need_aging) - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + if (time_is_after_jiffies(birth + min_ttl)) + return false; - return true; + if (!lruvec_is_sizable(lruvec, sc)) + return false; + + mem_cgroup_calculate_protection(NULL, memcg); + + return !mem_cgroup_below_min(NULL, memcg); } /* to protect the working set of the last N jiffies */ @@ -4567,46 +4585,32 @@ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; - bool success = false; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); VM_WARN_ON_ONCE(!current_is_kswapd()); sc->last_reclaimed = sc->nr_reclaimed; - /* - * To reduce the chance of going into the aging path, which can be - * costly, optimistically skip it if the flag below was cleared in the - * eviction path. This improves the overall performance when multiple - * memcgs are available. - */ - if (!sc->memcgs_need_aging) { - sc->memcgs_need_aging = true; + /* check the order to exclude compaction-induced reclaim */ + if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; - } - - set_mm_walk(pgdat); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - if (age_lruvec(lruvec, sc, min_ttl)) - success = true; + if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { + mem_cgroup_iter_break(NULL, memcg); + return; + } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - clear_mm_walk(); - - /* check the order to exclude compaction-induced reclaim */ - if (success || !min_ttl || sc->order) - return; - /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are - * either below min or empty. + * either too small or below min. */ if (mutex_trylock(&oom_lock)) { struct oom_control oc = { @@ -5114,34 +5118,28 @@ retry: * reclaim. */ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap, bool *need_aging) + bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && !sc->memcg_low_reclaim)) return 0; - *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); - if (!*need_aging) + if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) return nr_to_scan; /* skip the aging path at the default priority */ if (sc->priority == DEF_PRIORITY) - goto done; + return nr_to_scan; - /* leave the work to lru_gen_age_node() */ - if (current_is_kswapd()) - return 0; + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); - if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) - return nr_to_scan; -done: - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; + /* skip this lruvec as it's low on cold folios */ + return 0; } static unsigned long get_nr_to_reclaim(struct scan_control *sc) @@ -5160,9 +5158,7 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; - bool need_aging = false; unsigned long scanned = 0; - unsigned long reclaimed = sc->nr_reclaimed; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); lru_add_drain(); @@ -5183,13 +5179,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc else swappiness = 0; - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (!nr_to_scan) - goto done; + break; delta = evict_folios(lruvec, sc, swappiness); if (!delta) - goto done; + break; scanned += delta; if (scanned >= nr_to_scan) @@ -5201,10 +5197,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc cond_resched(); } - /* see the comment in lru_gen_age_node() */ - if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) - sc->memcgs_need_aging = false; -done: clear_mm_walk(); blk_finish_plug(&plug); -- cgit v1.2.3 From 77d4459a4a1a472b7309e475f962dda87d950abd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:03 -0700 Subject: mm: multi-gen LRU: shuffle should_run_aging() Move should_run_aging() next to its only caller left. Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 124 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cc522e048ed7..5a167f8efc38 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4467,68 +4467,6 @@ done: return true; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -{ - int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; - struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MIN_SEQ(lruvec); - - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; - return true; - } - - for (type = !can_swap; type < ANON_AND_FILE; type++) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; - } - } - - /* try to scrape all its memory if this memcg was deleted */ - *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; -} - static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; @@ -5112,6 +5050,68 @@ retry: return scanned; } +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + /* whether this lruvec is completely out of cold folios */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { + *nr_to_scan = 0; + return true; + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg -- cgit v1.2.3 From e4dde56cd208674ce899b47589f263499e5b8cdc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:04 -0700 Subject: mm: multi-gen LRU: per-node lru_gen_folio lists For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose reminder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen" to "old" and resets its "seg" to "default"; 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the young generation, updates its "gen" to "young" and resets its "seg" to "default". The events that trigger the above operations are: 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; 2. The first attempt to reclaim an memcg below low, which triggers MEMCG_LRU_TAIL; 3. The first attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_TAIL; 4. The second attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_YOUNG; 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. Note that memcg LRU only applies to global reclaim, and the round-robin incrementing of their max_seq counters ensures the eventual fairness to all eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++ include/linux/mm_inline.h | 17 +++ include/linux/mmzone.h | 117 +++++++++++++- mm/memcontrol.c | 16 ++ mm/page_alloc.c | 1 + mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 500 insertions(+), 35 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..2e08b05bc6bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index eb8a2435ee80..acf03147fff8 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } +#ifdef CONFIG_MEMCG +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} +#else +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} +#endif + static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void) return false; } +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c96ee823dbd..815c7c2edf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -367,6 +368,15 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + #ifdef CONFIG_LRU_GEN enum { @@ -426,6 +436,14 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; +#ifdef CONFIG_MEMCG + /* the memcg generation this lru_gen_folio belongs to */ + u8 gen; + /* the list segment this lru_gen_folio belongs to */ + u8 seg; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_node list; +#endif }; enum { @@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG + +/* + * For each node, memcgs are divided into two generations: the old and the + * young. For each generation, memcgs are randomly sharded into multiple bins + * to improve scalability. For each bin, the hlist_nulls is virtually divided + * into three segments: the head, the tail and the default. + * + * An onlining memcg is added to the tail of a random bin in the old generation. + * The eviction starts at the head of a random bin in the old generation. The + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes + * the old generation, is incremented when all its bins become empty. + * + * There are four operations: + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * current generation (old or young) and updates its "seg" to "head"; + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * current generation (old or young) and updates its "seg" to "tail"; + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * generation, updates its "gen" to "old" and resets its "seg" to "default"; + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * young generation, updates its "gen" to "young" and resets its "seg" to + * "default". + * + * The events that trigger the above operations are: + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; + * 2. The first attempt to reclaim an memcg below low, which triggers + * MEMCG_LRU_TAIL; + * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * + * Note that memcg LRU only applies to global reclaim, and the round-robin + * incrementing of their max_seq counters ensures the eventual fairness to all + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } @@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -1243,6 +1354,8 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 49f67176a1a2..2758b67eb169 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; + if (lru_gen_enabled()) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment on MEMCG_NR_GENS */ + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); + + return; + } + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; @@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, struct mem_cgroup_tree_per_node *mctz; unsigned long excess; + if (lru_gen_enabled()) + return 0; + if (order > 0) return 0; @@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + lru_gen_online_memcg(memcg); return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); + lru_gen_offline_memcg(memcg); drain_all_stock(memcg); @@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + lru_gen_release_memcg(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d980dc0000e..5668c1a2de49 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); + lru_gen_init_pgdat(pgdat); } static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a167f8efc38..178465a503db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include @@ -135,11 +137,6 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; -#ifdef CONFIG_LRU_GEN - /* help kswapd make better choices among multiple memcgs */ - unsigned long last_reclaimed; -#endif - /* Allocation order */ s8 order; @@ -3185,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -4453,8 +4453,7 @@ done: if (sc->priority <= DEF_PRIORITY - 2) wait_event_killable(lruvec->mm_state.wait, max_seq < READ_ONCE(lrugen->max_seq)); - - return max_seq < READ_ONCE(lrugen->max_seq); + return false; } VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); @@ -4527,8 +4526,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) VM_WARN_ON_ONCE(!current_is_kswapd()); - sc->last_reclaimed = sc->nr_reclaimed; - /* check the order to exclude compaction-induced reclaim */ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; @@ -5117,8 +5114,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5136,10 +5132,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * if (sc->priority == DEF_PRIORITY) return nr_to_scan; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); - /* skip this lruvec as it's low on cold folios */ - return 0; + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; } static unsigned long get_nr_to_reclaim(struct scan_control *sc) @@ -5148,29 +5142,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) if (!global_reclaim(sc)) return -1; - /* discount the previous progress for kswapd */ - if (current_is_kswapd()) - return sc->nr_to_reclaim + sc->last_reclaimed; - return max(sc->nr_to_reclaim, compact_gap(sc->order)); } -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - struct blk_plug plug; + long nr_to_scan; unsigned long scanned = 0; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); - lru_add_drain(); - - blk_start_plug(&plug); - - set_mm_walk(lruvec_pgdat(lruvec)); - while (true) { int delta; int swappiness; - unsigned long nr_to_scan; if (sc->may_swap) swappiness = get_swappiness(lruvec, sc); @@ -5180,7 +5163,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc swappiness = 0; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); - if (!nr_to_scan) + if (nr_to_scan <= 0) break; delta = evict_folios(lruvec, sc, swappiness); @@ -5197,11 +5180,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc cond_resched(); } + /* whether try_to_inc_max_seq() was successful */ + return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + unsigned long scanned = sc->nr_scanned; + unsigned long reclaimed = sc->nr_reclaimed; + int seg = lru_gen_memcg_seg(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + /* see the comment on MEMCG_NR_GENS */ + if (!lruvec_is_sizable(lruvec, sc)) + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; + + if (mem_cgroup_below_low(NULL, memcg)) { + /* see the comment on MEMCG_NR_GENS */ + if (seg != MEMCG_LRU_TAIL) + return MEMCG_LRU_TAIL; + + memcg_memory_event(memcg, MEMCG_LOW); + } + + success = try_to_shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; + current->reclaim_state->reclaimed_slab = 0; + + return success ? MEMCG_LRU_YOUNG : 0; +} + +#ifdef CONFIG_MEMCG + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + const struct hlist_nulls_node *pos; + int op = 0; + struct mem_cgroup *memcg = NULL; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); +restart: + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + + rcu_read_lock(); + + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + lruvec = container_of(lrugen, struct lruvec, lrugen); + memcg = lruvec_memcg(lruvec); + + if (!mem_cgroup_tryget(memcg)) { + op = 0; + memcg = NULL; + continue; + } + + rcu_read_unlock(); + + op = shrink_one(lruvec, sc); + + if (sc->nr_reclaimed >= nr_to_reclaim) + goto success; + + rcu_read_lock(); + } + + rcu_read_unlock(); + + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; + + /* try the rest of the bins of the current generation */ + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; +success: + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + + VM_WARN_ON_ONCE(global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + clear_mm_walk(); blk_finish_plug(&plug); } +#else /* !CONFIG_MEMCG */ + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + BUILD_BUG(); +} + +#endif + +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the + * estimated reclaimed_to_scanned_ratio = inactive / total. + */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_swappiness(lruvec, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + reclaimable /= MEMCG_NR_GENS; + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + sc->priority = clamp(priority, 0, DEF_PRIORITY); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long reclaimed = sc->nr_reclaimed; + + VM_WARN_ON_ONCE(!global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(pgdat); + + set_initial_priority(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed = 0; + + if (mem_cgroup_disabled()) + shrink_one(&pgdat->__lruvec, sc); + else + shrink_many(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed += reclaimed; + + clear_mm_walk(); + + blk_finish_plug(&plug); + + /* kswapd should never fail */ + pgdat->kswapd_failures = 0; +} + +#ifdef CONFIG_MEMCG +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} +#endif + /****************************************************************************** * state change ******************************************************************************/ @@ -5655,11 +5879,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); -#ifdef CONFIG_MEMCG - if (memcg && !css_tryget(&memcg->css)) + if (!mem_cgroup_tryget(memcg)) memcg = NULL; -#endif + rcu_read_unlock(); if (!memcg) @@ -5807,6 +6031,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) } #ifdef CONFIG_MEMCG + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); @@ -5830,7 +6067,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } } -#endif + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +#endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { @@ -5857,6 +6156,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -5870,7 +6173,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !global_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6113,6 +6416,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; + if (lru_gen_enabled() && global_reclaim(sc)) { + lru_gen_shrink_node(pgdat, sc); + return; + } + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: -- cgit v1.2.3 From e9d4e1ee788097484606c32122f146d802a9c5fb Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:05 -0700 Subject: mm: multi-gen LRU: clarify scan_control flags Among the flags in scan_control: 1. sc->may_swap, which indicates swap constraint due to memsw.max, is supported as usual. 2. sc->proactive, which indicates reclaim by memory.reclaim, may not opportunistically skip the aging path, since it is considered less latency sensitive. 3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers swappiness to prioritize file LRU, since clean file folios are more likely to exist. 4. sc->may_writepage and sc->may_unmap, which indicates opportunistic reclaim, are rejected, since unmapped clean folios are already prioritized. Scanning for more of them is likely futile and can cause high reclaim latency when there is a large number of memcgs. The rest are handled by the existing code. Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 56 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 178465a503db..2964652d1aa8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3210,6 +3210,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); + if (!sc->may_swap) + return 0; + if (!can_demote(pgdat->node_id, sc) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; @@ -4236,7 +4239,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ } while (err == -EAGAIN); } -static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; @@ -4244,7 +4247,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; - } else if (!pgdat && !walk) { + } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -4430,7 +4433,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, goto done; } - walk = set_mm_walk(NULL); + walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; @@ -4499,8 +4502,6 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); - VM_WARN_ON_ONCE(sc->memcg_low_reclaim); - /* see the comment on lru_gen_folio */ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); @@ -4756,12 +4757,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* unmapping inhibited */ - if (!sc->may_unmap && folio_mapped(folio)) - return false; - /* swapping inhibited */ - if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; @@ -4858,9 +4855,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(PGSCAN_ANON + type, isolated); /* - * There might not be eligible pages due to reclaim_idx, may_unmap and - * may_writepage. Check the remaining to prevent livelock if it's not - * making progress. + * There might not be eligible folios due to reclaim_idx. Check the + * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? scanned : 0; } @@ -5120,9 +5116,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); - if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || - (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && - !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return 0; if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) @@ -5150,17 +5144,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) long nr_to_scan; unsigned long scanned = 0; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + int swappiness = get_swappiness(lruvec, sc); + + /* clean file folios are more likely to exist */ + if (swappiness && !(sc->gfp_mask & __GFP_IO)) + swappiness = 1; while (true) { int delta; - int swappiness; - - if (sc->may_swap) - swappiness = get_swappiness(lruvec, sc); - else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) - swappiness = 1; - else - swappiness = 0; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (nr_to_scan <= 0) @@ -5291,12 +5282,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc struct blk_plug plug; VM_WARN_ON_ONCE(global_reclaim(sc)); + VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); - set_mm_walk(lruvec_pgdat(lruvec)); + set_mm_walk(NULL, sc->proactive); if (try_to_shrink_lruvec(lruvec, sc)) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); @@ -5352,11 +5344,19 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * VM_WARN_ON_ONCE(!global_reclaim(sc)); + /* + * Unmapped clean folios are already prioritized. Scanning for more of + * them is likely futile and can cause high reclaim latency when there + * is a large number of memcgs. + */ + if (!sc->may_writepage || !sc->may_unmap) + goto done; + lru_add_drain(); blk_start_plug(&plug); - set_mm_walk(pgdat); + set_mm_walk(pgdat, sc->proactive); set_initial_priority(pgdat, sc); @@ -5374,7 +5374,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * clear_mm_walk(); blk_finish_plug(&plug); - +done: /* kswapd should never fail */ pgdat->kswapd_failures = 0; } @@ -5943,7 +5943,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); - if (!set_mm_walk(NULL)) { + if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } -- cgit v1.2.3 From f386e9314025ea99dae639ed2032560a92081430 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:06 -0700 Subject: mm: multi-gen LRU: simplify arch_has_hw_pte_young() check Scanning page tables when hardware does not set the accessed bit has no real use cases. Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2964652d1aa8..7c3fd900a89d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4428,7 +4428,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. */ - if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; } -- cgit v1.2.3 From a9af8e6bb3e5de8ea9d29c1d318bcfbc5667c939 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Fri, 23 Dec 2022 10:50:24 +0800 Subject: selftests/vm: ksm_functional_tests: fix a typo in comment Fix a typo of "comaring" which should be "comparing". Link: https://lkml.kernel.org/r/202212231050245952617@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: xu xin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_functional_tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c index b11b7e5115dc..d8b5b4930412 100644 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -37,7 +37,7 @@ static bool range_maps_duplicates(char *addr, unsigned long size) /* * There is no easy way to check if there are KSM pages mapped into * this range. We only check that the range does not map the same PFN - * twice by comaring each pair of mapped pages. + * twice by comparing each pair of mapped pages. */ for (offs_a = 0; offs_a < size; offs_a += pagesize) { pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); -- cgit v1.2.3 From 931298e103c228c4ce6d13e7b5781aeaaff37ac7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Dec 2022 16:56:15 +0100 Subject: mm/userfaultfd: rely on vma->vm_page_prot in uffd_wp_range() Patch series "mm: uffd-wp + change_protection() cleanups". Cleanup page protection handling in uffd-wp when calling change_protection() and improve unprotecting uffd=wp in private mappings, trying to set PTEs writable again if possible just like we do during mprotect() when upgrading write permissions. Make the change_protection() interface harder to get wrong :) I consider both pages primarily cleanups, although patch #1 fixes a corner case with uffd-wp and softdirty tracking for shmem. @Peter, please let me know if we should flag patch #1 as pure cleanup -- I have no idea how important softdirty tracking on shmem is. This patch (of 2): uffd_wp_range() currently calculates page protection manually using vm_get_page_prot(). This will ignore any other reason for active writenotify: one mechanism applicable to shmem is softdirty tracking. For example, the following sequence 1) Write to mapped shmem page 2) Clear softdirty 3) Register uffd-wp covering the mapped page 4) Unregister uffd-wp covering the mapped page 5) Write to page again will not set the modified page softdirty, because uffd_wp_range() will ignore that writenotify is required for softdirty tracking and simply map the page writable again using change_protection(). Similarly, instead of unregistering, protecting followed by un-protecting the page using uffd-wp would result in the same situation. Now that we enable writenotify whenever enabling uffd-wp on a VMA, vma->vm_page_prot will already properly reflect our requirements: the default is to write-protect all PTEs. However, for shared mappings we would now not remap the PTEs writable if possible when unprotecting, just like for private mappings (COW). To compensate, set MM_CP_TRY_CHANGE_WRITABLE just like mprotect() does to try mapping individual PTEs writable. For private mappings, this change implies that we will now always try setting PTEs writable when un-protecting, just like when upgrading write permissions using mprotect(), which is an improvement. For shared mappings, we will only set PTEs writable if can_change_pte_writable()/can_change_pmd_writable() indicates that it's ok. For ordinary shmem, this will be the case when PTEs are dirty, which should usually be the case -- otherwise we could special-case shmem in can_change_pte_writable()/can_change_pmd_writable() easily, because shmem itself doesn't require writenotify. Note that hugetlb does not yet implement MM_CP_TRY_CHANGE_WRITABLE, so we won't try setting PTEs writable when unprotecting or when unregistering uffd-wp. This can be added later on top by implementing MM_CP_TRY_CHANGE_WRITABLE. While commit ffd05793963a ("userfaultfd: wp: support write protection for userfault vma range") introduced that code, it should only be applicable to uffd-wp on shared mappings -- shmem (hugetlb does not support softdirty tracking). I don't think this corner cases justifies to cc stable. Let's just handle it correctly and prepare for change_protection() cleanups. [david@redhat.com: o need for additional harmless checks if we're wr-protecting either way] Link: https://lkml.kernel.org/r/71412742-a71f-9c74-865f-773ad83db7a5@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-1-david@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-2-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Nadav Amit Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index f8d31b82aceb..46771362550f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -713,17 +713,25 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { + unsigned int mm_cp_flags; struct mmu_gather tlb; - pgprot_t newprot; if (enable_wp) - newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + mm_cp_flags = MM_CP_UFFD_WP; else - newprot = vm_get_page_prot(dst_vma->vm_flags); + mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; + /* + * vma->vm_page_prot already reflects that uffd-wp is enabled for this + * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed + * to be write-protected as default whenever protection changes. + * Try upgrading write permissions manually. + */ + if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, - enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + change_protection(&tlb, dst_vma, start, start + len, vma->vm_page_prot, + mm_cp_flags); tlb_finish_mmu(&tlb); } -- cgit v1.2.3 From 1ef488edd6c4d447784710974f049628c2890481 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Dec 2022 16:56:16 +0100 Subject: mm/mprotect: drop pgprot_t parameter from change_protection() Being able to provide a custom protection opens the door for inconsistencies and BUGs: for example, accidentally allowing for more permissions than desired by other mechanisms (e.g., softdirty tracking). vma->vm_page_prot should be the single source of truth. Only PROT_NUMA is special: there is no way we can erroneously allow for more permissions when removing all permissions. Special-case using the MM_CP_PROT_NUMA flag. [david@redhat.com: PAGE_NONE might not be defined without CONFIG_NUMA_BALANCING] Link: https://lkml.kernel.org/r/5084ff1c-ebb3-f918-6a60-bacabf550a88@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Nadav Amit Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +-- mm/mempolicy.c | 3 +-- mm/mprotect.c | 18 +++++++++++++++--- mm/userfaultfd.c | 3 +-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d68579bf8484..329ed67edd76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2134,8 +2134,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags); + unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index becf41e10076..d3558248a0f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -635,8 +635,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); - nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, - MM_CP_PROT_NUMA); + nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/mprotect.c b/mm/mprotect.c index bf8fa0af5a15..71358e45a742 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -507,13 +507,25 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags) + unsigned long end, unsigned long cp_flags) { + pgprot_t newprot = vma->vm_page_prot; unsigned long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); +#ifdef CONFIG_NUMA_BALANCING + /* + * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking) + * are expected to reflect their requirements via VMA flags such that + * vma_set_page_prot() will adjust vma->vm_page_prot accordingly. + */ + if (cp_flags & MM_CP_PROT_NUMA) + newprot = PAGE_NONE; +#else + WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA); +#endif + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot, cp_flags); @@ -642,7 +654,7 @@ success: mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); + change_protection(tlb, vma, start, end, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 46771362550f..65ad172add27 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -730,8 +730,7 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, vma->vm_page_prot, - mm_cp_flags); + change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); } -- cgit v1.2.3 From 3783e1721b650588938d28e4a084a1c9748361c8 Mon Sep 17 00:00:00 2001 From: Kele Huang Date: Sat, 24 Dec 2022 01:02:33 -0500 Subject: mm: fix comment of page table counter Commit af5b0f6a09e42 ("mm: consolidate page table accounting") consolidates page table accounting to a single counter in struct mm_struct {} as mm->pgtables_bytes. So the meanning of this counter should be the size of all page tables now. Link: https://lkml.kernel.org/r/20221224060233.417827-1-kele.huang@columbia.edu Signed-off-by: Kele Huang Cc: Arnd Bergmann Cc: Colin Cross Cc: David Hildenbrand Cc: Hugh Dickins Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Pasha Tatashin Cc: Peter Xu Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1118e381fcdc..10b6eb311ede 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t pgtables_bytes; /* PTE page table pages */ + atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ -- cgit v1.2.3 From 01b5022f0a8a2911bb8f2bc3f0c9b9b2c21c3316 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 28 Dec 2022 17:59:42 +0000 Subject: mm/page_reporting: replace rcu_access_pointer() with rcu_dereference_protected() Page reporting fetches pr_dev_info using rcu_access_pointer(), which is for safely fetching a pointer that will not be dereferenced but could concurrently updated. The code indeed does not dereference pr_dev_info after fetching it using rcu_access_pointer(), but it fetches the pointer while concurrent updates to the pointer is avoided by holding the update side lock, page_reporting_mutex. In the case, rcu_dereference_protected() should be used instead because it provides better readability and performance on some cases, as rcu_dereference_protected() avoids use of READ_ONCE(). Replace the rcu_access_pointer() calls with rcu_dereference_protected(). Link: https://lkml.kernel.org/r/20221228175942.149491-1-sj@kernel.org Fixes: 36e66c554b5c ("mm: introduce Reported pages") Signed-off-by: SeongJae Park Cc: Alexander Duyck Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_reporting.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 79a8554f024c..c65813a9dc78 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -356,7 +356,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) mutex_lock(&page_reporting_mutex); /* nothing to do if already in use */ - if (rcu_access_pointer(pr_dev_info)) { + if (rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { err = -EBUSY; goto err_out; } @@ -401,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) { mutex_lock(&page_reporting_mutex); - if (rcu_access_pointer(pr_dev_info) == prdev) { + if (prdev == rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { /* Disable page reporting notification */ RCU_INIT_POINTER(pr_dev_info, NULL); synchronize_rcu(); -- cgit v1.2.3 From 81e506bec9be1eceaf5a2c654e28ba5176ef48d8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Fri, 23 Dec 2022 21:52:07 +0800 Subject: mm/thp: check and bail out if page in deferred queue already Kernel build regression with LLVM was reported here: https://lore.kernel.org/all/Y1GCYXGtEVZbcv%2F5@dev-arch.thelio-3990X/ with commit f35b5d7d676e ("mm: align larger anonymous mappings on THP boundaries"). And the commit f35b5d7d676e was reverted. It turned out the regression is related with madvise(MADV_DONTNEED) was used by ld.lld. But with none PMD_SIZE aligned parameter len. trace-bpfcc captured: 531607 531732 ld.lld do_madvise.part.0 start: 0x7feca9000000, len: 0x7fb000, behavior: 0x4 531607 531793 ld.lld do_madvise.part.0 start: 0x7fec86a00000, len: 0x7fb000, behavior: 0x4 If the underneath physical page is THP, the madvise(MADV_DONTNEED) can trigger split_queue_lock contention raised significantly. perf showed following data: 14.85% 0.00% ld.lld [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe 11.52% entry_SYSCALL_64_after_hwframe do_syscall_64 __x64_sys_madvise do_madvise.part.0 zap_page_range unmap_single_vma unmap_page_range page_remove_rmap deferred_split_huge_page __lock_text_start native_queued_spin_lock_slowpath If THP can't be removed from rmap as whole THP, partial THP will be removed from rmap by removing sub-pages from rmap. Even the THP head page is added to deferred queue already, the split_queue_lock will be acquired and check whether the THP head page is in the queue already. Thus, the contention of split_queue_lock is raised. Before acquire split_queue_lock, check and bail out early if the THP head page is in the queue already. The checking without holding split_queue_lock could race with deferred_split_scan, but it doesn't impact the correctness here. Test result of building kernel with ld.lld: commit 7b5a0b664ebe (parent commit of f35b5d7d676e): time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:07.99 real, 26367.77 user, 5063.35 sys commit f35b5d7d676e: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 7:22.15 real, 26235.03 user, 12504.55 sys commit f35b5d7d676e with the fixing patch: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:08.49 real, 26520.15 user, 5047.91 sys Link: https://lkml.kernel.org/r/20221223135207.2275317-1-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Tested-by: Nathan Chancellor Acked-by: David Rientjes Reviewed-by: "Huang, Ying" Cc: Feng Tang Cc: Matthew Wilcox Cc: Rik van Riel Cc: Xing Zhengjun Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 266c4b557946..0ea6510a3be7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2835,6 +2835,9 @@ void deferred_split_huge_page(struct page *page) if (PageSwapCache(page)) return; + if (!list_empty(page_deferred_list(page))) + return; + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); -- cgit v1.2.3 From 5b68de67037168f826d6fe434d03b5876aec4cb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:26 -1000 Subject: fs: remove an outdated comment on mpage_writepages Patch series "remove generic_writepages" This series removes generic_writepages by open coding the current functionality in the three remaining callers. Besides removing some code the main benefit is that one of the few remaining ->writepage callers from outside the core page cache code go away. This patch (of 6): mpage_writepages doesn't do any of the page locking itself, so remove and outdated comment on the locking pattern there. Link: https://lkml.kernel.org/r/20221229161031.391878-1-hch@lst.de Link: https://lkml.kernel.org/r/20221229161031.391878-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/mpage.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index db59cbf6affc..d36a95473f77 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -641,14 +641,6 @@ out: * * This is a library function, which implements the writepages() * address_space_operation. - * - * If a page is already under I/O, generic_writepages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. */ int mpage_writepages(struct address_space *mapping, -- cgit v1.2.3 From d4428bad14dd1509d7a1176dba69a01d67c0b86d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:27 -1000 Subject: ntfs3: stop using generic_writepages Open code the resident inode handling in ntfs_writepages by directly using write_cache_pages to prepare removing the ->writepage handler in ntfs3. Link: https://lkml.kernel.org/r/20221229161031.391878-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 20b953871574..b6dad2da5950 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -852,12 +852,29 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, ntfs_get_block, wbc); } +static int ntfs_resident_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct address_space *mapping = data; + struct ntfs_inode *ni = ntfs_i(mapping->host); + int ret; + + ni_lock(ni); + ret = attr_data_write_resident(ni, page); + ni_unlock(ni); + + if (ret != E_NTFS_NONRESIDENT) + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - /* Redirect call to 'ntfs_writepage' for resident files. */ if (is_resident(ntfs_i(mapping->host))) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, ntfs_resident_writepage, + mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); } -- cgit v1.2.3 From 25a89826f270ddbf76dca7d64e4f8a8dccda3d1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:28 -1000 Subject: ntfs3: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221229161031.391878-4-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index b6dad2da5950..6b50b6e32378 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -832,26 +832,6 @@ out: return err; } -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); - int err; - - if (is_resident(ni)) { - ni_lock(ni); - err = attr_data_write_resident(ni, page); - ni_unlock(ni); - if (err != E_NTFS_NONRESIDENT) { - unlock_page(page); - return err; - } - } - - return block_write_full_page(page, ntfs_get_block, wbc); -} - static int ntfs_resident_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -2083,13 +2063,13 @@ const struct inode_operations ntfs_link_inode_operations = { const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, - .writepage = ntfs_writepage, .writepages = ntfs_writepages, .write_begin = ntfs_write_begin, .write_end = ntfs_write_end, .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, + .migrate_folio = buffer_migrate_folio, .invalidate_folio = block_invalidate_folio, }; -- cgit v1.2.3 From cff61bbc717bfddd6e433fe142b8e70b21546a1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:29 -1000 Subject: jbd2,ocfs2: move jbd2_journal_submit_inode_data_buffers to ocfs2 jbd2_journal_submit_inode_data_buffers is only used by ocfs2, so move it there to prepare for removing generic_writepages. Link: https://lkml.kernel.org/r/20221229161031.391878-5-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 25 ------------------------- fs/jbd2/journal.c | 1 - fs/ocfs2/journal.c | 16 +++++++++++++++- include/linux/jbd2.h | 2 -- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 96a1ebc6342d..b33155dd7001 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -177,31 +177,6 @@ static int journal_wait_on_commit_record(journal_t *journal, return ret; } -/* - * write the filemap data using writepage() address_space_operations. - * We don't do block allocation here even for delalloc. We don't - * use writepages() because with delayed allocation we may be doing - * block allocation in writepages(). - */ -int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) -{ - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - /* - * submit the inode data buffers. We use writepage - * instead of writepages. Because writepages can do - * block allocation with delalloc. We need to write - * only allocated blocks here. - */ - return generic_writepages(mapping, &wbc); -} - /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4095fe91457f..e80c781731f8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); -EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3fb98b4569a2..59f612684c51 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -841,6 +842,19 @@ bail: return status; } +static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + + return generic_writepages(mapping, &wbc); +} + int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; @@ -910,7 +924,7 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) journal->j_journal = j_journal; journal->j_journal->j_submit_inode_data_buffers = - jbd2_journal_submit_inode_data_buffers; + ocfs2_journal_submit_inode_data_buffers; journal->j_journal->j_finish_inode_data_buffers = jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2170e0cc279d..5962072a4b19 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1570,8 +1570,6 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle, extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); -extern int jbd2_journal_submit_inode_data_buffers( - struct jbd2_inode *jinode); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, -- cgit v1.2.3 From 17c30ee6f2670804148f23b19b5de8308a02bd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:30 -1000 Subject: ocfs2: use filemap_fdatawrite_wbc instead of generic_writepages filemap_fdatawrite_wbc is a fairly thing wrapper around do_writepages, and the big difference there is support for cgroup writeback, which is not supported by ocfs2, and the potential to use ->writepages instead of ->writepage, which ocfs2 does not currently implement but eventually should. Link: https://lkml.kernel.org/r/20221229161031.391878-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 59f612684c51..25d8072ccfce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -852,7 +852,7 @@ static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_end = jinode->i_dirty_end, }; - return generic_writepages(mapping, &wbc); + return filemap_fdatawrite_wbc(mapping, &wbc); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) -- cgit v1.2.3 From c2ca7a59a4199059556b57cfdf98fcf46039ca6b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:31 -1000 Subject: mm: remove generic_writepages Now that all external callers are gone, just fold it into do_writepages. Link: https://lkml.kernel.org/r/20221229161031.391878-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/writeback.h | 2 -- mm/page-writeback.c | 53 ++++++++++++++--------------------------------- 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06f9291b6fd5..2554b71765e9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -369,8 +369,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 41128ea9c997..337cafe9978c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2526,12 +2526,8 @@ continue_unlock: } EXPORT_SYMBOL(write_cache_pages); -/* - * Function used by generic_writepages to call the real writepage - * function and set the mapping flags on error - */ -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) +static int writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) { struct address_space *mapping = data; int ret = mapping->a_ops->writepage(page, wbc); @@ -2539,34 +2535,6 @@ static int __writepage(struct page *page, struct writeback_control *wbc, return ret; } -/** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * - * This is a library function, which implements the writepages() - * address_space_operation. - * - * Return: %0 on success, negative error code otherwise - */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct blk_plug plug; - int ret; - - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); - return ret; -} - -EXPORT_SYMBOL(generic_writepages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2577,11 +2545,20 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) + if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); - else - ret = generic_writepages(mapping, wbc); - if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) + } else if (mapping->a_ops->writepage) { + struct blk_plug plug; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, writepage_cb, + mapping); + blk_finish_plug(&plug); + } else { + /* deal with chardevs and other special files */ + ret = 0; + } + if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* -- cgit v1.2.3 From 630e7c5ee3399be509447035428994e2d88f12c1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 29 Dec 2022 20:25:03 +0800 Subject: mm: huge_memory: convert split_huge_pages_all() to use a folio Straightforwardly convert split_huge_pages_all() to use a folio. Link: https://lkml.kernel.org/r/20221229122503.149083-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Mike Kravetz Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0ea6510a3be7..7e68a36b4f7d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2935,6 +2935,7 @@ static void split_huge_pages_all(void) { struct zone *zone; struct page *page; + struct folio *folio; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; @@ -2947,24 +2948,32 @@ static void split_huge_pages_all(void) int nr_pages; page = pfn_to_online_page(pfn); - if (!page || !get_page_unless_zero(page)) + if (!page || PageTail(page)) + continue; + folio = page_folio(page); + if (!folio_try_get(folio)) continue; - if (zone != page_zone(page)) + if (unlikely(page_folio(page) != folio)) + goto next; + + if (zone != folio_zone(folio)) goto next; - if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) + if (!folio_test_large(folio) + || folio_test_hugetlb(folio) + || !folio_test_lru(folio)) goto next; total++; - lock_page(page); - nr_pages = thp_nr_pages(page); - if (!split_huge_page(page)) + folio_lock(folio); + nr_pages = folio_nr_pages(folio); + if (!split_folio(folio)) split++; pfn += nr_pages - 1; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); cond_resched(); } } -- cgit v1.2.3 From 071acb3084c5e04db3ff09865e4030aefaa2ab92 Mon Sep 17 00:00:00 2001 From: JeongHyeon Lee Date: Fri, 23 Dec 2022 13:03:31 +0900 Subject: zram: fix typos in comments - The double `range` is duplicated in comment, remove one. - change `syfs` to `sysfs` Link: https://lkml.kernel.org/r/20221223040331.4194-1-jhs2.lee@samsung.com Signed-off-by: JeongHyeon Lee Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e290d6d97047..7becd5448791 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -190,7 +190,7 @@ static inline bool valid_io_request(struct zram *zram, end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ + /* out of range */ if (unlikely(start >= bound || end > bound || start > end)) return false; @@ -2385,7 +2385,7 @@ static int zram_add(void) zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - /* Actual capacity set using syfs (/sys/block/zram/disksize */ + /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); -- cgit v1.2.3 From becacb04fdd439d7d1f2a93739161706a2e3e947 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 30 Dec 2022 15:08:42 +0800 Subject: mm: memcg: add folio_memcg_check() Patch series "mm: convert page_idle/damon to use folios", v4. This patch (of 8): Convert page_memcg_check() into folio_memcg_check() and add a page_memcg_check() wrapper. The behaviour of page_memcg_check() is unchanged; tail pages always had a NULL ->memcg_data. Link: https://lkml.kernel.org/r/20221230070849.63358-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221230070849.63358-2-wangkefeng.wang@huawei.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: SeongJae Park Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 40 ++++++++++++++++++++++++++-------------- mm/memcontrol.c | 6 +++--- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2e08b05bc6bf..26667bf16da5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -466,34 +466,34 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) } /* - * page_memcg_check - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg_check - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page - * as an argument. It has to be used in cases when it's not known if a page + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function unlike folio_memcg() can take any folio + * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - lock_folio_memcg() * - exclusive reference * - mem_cgroup_trylock_pages() * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. */ -static inline struct mem_cgroup *page_memcg_check(struct page *page) +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* - * Because page->memcg_data might be changed asynchronously - * for slab pages, READ_ONCE() should be used here. + * Because folio->memcg_data might be changed asynchronously + * for slabs, READ_ONCE() should be used here. */ - unsigned long memcg_data = READ_ONCE(page->memcg_data); + unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; @@ -508,6 +508,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + if (PageTail(page)) + return NULL; + return folio_memcg_check((struct folio *)page); +} + static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; @@ -1170,6 +1177,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return NULL; } +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2758b67eb169..17965e558ab7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2952,13 +2952,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) } /* - * page_memcg_check() is used here, because in theory we can encounter + * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but * slab->memcg_data has not been freed yet - * page_memcg_check(page) will guarantee that a proper memory + * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(folio_page(folio, 0)); + return folio_memcg_check(folio); } /* -- cgit v1.2.3 From 5acc17fd35e62780a14e4198deb2a6d1d57aa372 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:43 +0800 Subject: mm: page_idle: convert page idle to use a folio Firstly, make page_idle_get_page() return a folio, also rename it to page_idle_get_folio(), then, use it to convert page_idle_bitmap_read() and page_idle_bitmap_write() functions. Link: https://lkml.kernel.org/r/20221230070849.63358-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_idle.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index bc08332a609c..41ea77f22011 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -31,19 +31,22 @@ * * This function tries to get a user memory page by pfn as described above. */ -static struct page *page_idle_get_page(unsigned long pfn) +static struct folio *page_idle_get_folio(unsigned long pfn) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; - if (!page || !PageLRU(page) || - !get_page_unless_zero(page)) + if (!page || PageTail(page)) return NULL; - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; + folio = page_folio(page); + if (!folio_test_lru(folio) || !folio_try_get(folio)) + return NULL; + if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) { + folio_put(folio); + folio = NULL; } - return page; + return folio; } static bool page_idle_clear_pte_refs_one(struct folio *folio, @@ -83,10 +86,8 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, return true; } -static void page_idle_clear_pte_refs(struct page *page) +static void page_idle_clear_pte_refs(struct folio *folio) { - struct folio *folio = page_folio(page); - /* * Since rwc.try_lock is unused, rwc is effectively immutable, so we * can make it static to save some cycles and stack. @@ -115,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, loff_t pos, size_t count) { u64 *out = (u64 *)buf; - struct page *page; + struct folio *folio; unsigned long pfn, end_pfn; int bit; @@ -134,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, bit = pfn % BITMAP_CHUNK_BITS; if (!bit) *out = 0ULL; - page = page_idle_get_page(pfn); - if (page) { - if (page_is_idle(page)) { + folio = page_idle_get_folio(pfn); + if (folio) { + if (folio_test_idle(folio)) { /* * The page might have been referenced via a * pte, in which case it is not idle. Clear * refs and recheck. */ - page_idle_clear_pte_refs(page); - if (page_is_idle(page)) + page_idle_clear_pte_refs(folio); + if (folio_test_idle(folio)) *out |= 1ULL << bit; } - put_page(page); + folio_put(folio); } if (bit == BITMAP_CHUNK_BITS - 1) out++; @@ -160,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; - struct page *page; + struct folio *folio; unsigned long pfn, end_pfn; int bit; @@ -178,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; if ((*in >> bit) & 1) { - page = page_idle_get_page(pfn); - if (page) { - page_idle_clear_pte_refs(page); - set_page_idle(page); - put_page(page); + folio = page_idle_get_folio(pfn); + if (folio) { + page_idle_clear_pte_refs(folio); + folio_set_idle(folio); + folio_put(folio); } } if (bit == BITMAP_CHUNK_BITS - 1) -- cgit v1.2.3 From 5e012bba019afa6aca74df19751783a47d16ebf7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:44 +0800 Subject: mm/damon: introduce damon_get_folio() Introduce damon_get_folio(), and the temporary wrapper function damon_get_page(), which help us to convert damon related functions to use folios, and it will be dropped once the conversion is completed. Link: https://lkml.kernel.org/r/20221230070849.63358-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 18 +++++++++++------- mm/damon/ops-common.h | 9 ++++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 75409601f934..1294a256a87c 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -16,21 +16,25 @@ * Get an online page for a pfn if it's in the LRU list. Otherwise, returns * NULL. * - * The body of this function is stolen from the 'page_idle_get_page()'. We + * The body of this function is stolen from the 'page_idle_get_folio()'. We * steal rather than reuse it because the code is quite simple. */ -struct page *damon_get_page(unsigned long pfn) +struct folio *damon_get_folio(unsigned long pfn) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + if (!page || PageTail(page)) return NULL; - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; + folio = page_folio(page); + if (!folio_test_lru(folio) || !folio_try_get(folio)) + return NULL; + if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) { + folio_put(folio); + folio = NULL; } - return page; + return folio; } void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 8d82d3722204..65f290f0a9d6 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -7,7 +7,14 @@ #include -struct page *damon_get_page(unsigned long pfn); +struct folio *damon_get_folio(unsigned long pfn); +static inline struct page *damon_get_page(unsigned long pfn) +{ + struct folio *folio = damon_get_folio(pfn); + + /* when folio is NULL, return &(0->page) mean return NULL */ + return &folio->page; +} void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -- cgit v1.2.3 From 70e314c9ab4faf810fc088503a37fb3b126d09e2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:45 +0800 Subject: mm/damon: convert damon_ptep/pmdp_mkold() to use a folio With damon_get_folio(), let's convert damon_ptep_mkold() and damon_pmdp_mkold() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 1294a256a87c..cc63cf953636 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -40,9 +40,9 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) { bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(*pte)); - if (!page) + if (!folio) return; if (pte_young(*pte)) { @@ -56,19 +56,19 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); } void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); + struct folio *folio = damon_get_folio(pmd_pfn(*pmd)); - if (!page) + if (!folio) return; if (pmd_young(*pmd)) { @@ -82,10 +82,10 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } -- cgit v1.2.3 From 07bb1fbaa2bbd2a387bcc1171d8aee1a96178b45 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:46 +0800 Subject: mm/damon/paddr: convert damon_pa_*() to use a folio With damon_get_folio(), let's convert all the damon_pa_*() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 58 +++++++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6334c99e5152..99d4c357ef2b 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -33,17 +33,15 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, static void damon_pa_mkold(unsigned long paddr) { - struct folio *folio; - struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { .rmap_one = __damon_pa_mkold, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!page) + if (!folio) return; - folio = page_folio(page); if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); @@ -122,8 +120,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) { - struct folio *folio; - struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { .page_sz = PAGE_SIZE, .accessed = false, @@ -135,9 +132,8 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) }; bool need_lock; - if (!page) + if (!folio) return false; - folio = page_folio(page); if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) @@ -203,18 +199,18 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) } static bool __damos_pa_filter_out(struct damos_filter *filter, - struct page *page) + struct folio *folio) { bool matched = false; struct mem_cgroup *memcg; switch (filter->type) { case DAMOS_FILTER_TYPE_ANON: - matched = PageAnon(page); + matched = folio_test_anon(folio); break; case DAMOS_FILTER_TYPE_MEMCG: rcu_read_lock(); - memcg = page_memcg_check(page); + memcg = folio_memcg_check(folio); if (!memcg) matched = false; else @@ -231,12 +227,12 @@ static bool __damos_pa_filter_out(struct damos_filter *filter, /* * damos_pa_filter_out - Return true if the page should be filtered out. */ -static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, page)) + if (__damos_pa_filter_out(filter, folio)) return true; } return false; @@ -245,33 +241,33 @@ static bool damos_pa_filter_out(struct damos *scheme, struct page *page) static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; - LIST_HEAD(page_list); + LIST_HEAD(folio_list); for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!page) + if (!folio) continue; - if (damos_pa_filter_out(s, page)) { - put_page(page); + if (damos_pa_filter_out(s, folio)) { + folio_put(folio); continue; } - ClearPageReferenced(page); - test_and_clear_page_young(page); - if (isolate_lru_page(page)) { - put_page(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); + if (folio_isolate_lru(folio)) { + folio_put(folio); continue; } - if (PageUnevictable(page)) { - putback_lru_page(page); + if (folio_test_unevictable(folio)) { + folio_putback_lru(folio); } else { - list_add(&page->lru, &page_list); - put_page(page); + list_add(&folio->lru, &folio_list); + folio_put(folio); } } - applied = reclaim_pages(&page_list); + applied = reclaim_pages(&folio_list); cond_resched(); return applied * PAGE_SIZE; } @@ -282,14 +278,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( unsigned long addr, applied = 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); - struct folio *folio; + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!page) + if (!folio) continue; - folio = page_folio(page); - if (damos_pa_filter_out(s, &folio->page)) { + if (damos_pa_filter_out(s, folio)) { folio_put(folio); continue; } -- cgit v1.2.3 From dc1b78665b37ec50fc142a7f1a8fa4f626cd6a58 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:47 +0800 Subject: mm/damon/vaddr: convert damon_young_pmd_entry() to use a folio With damon_get_folio(), let's convert damon_young_pmd_entry() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 15f03df66db6..29227b7a6032 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -431,7 +431,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, { pte_t *pte; spinlock_t *ptl; - struct page *page; + struct folio *folio; struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -446,16 +446,16 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); goto regular_page; } - page = damon_get_page(pmd_pfn(*pmd)); - if (!page) + folio = damon_get_folio(pmd_pfn(*pmd)); + if (!folio) goto huge_out; - if (pmd_young(*pmd) || !page_is_idle(page) || + if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = HPAGE_PMD_SIZE; priv->young = true; } - put_page(page); + folio_put(folio); huge_out: spin_unlock(ptl); return 0; @@ -469,15 +469,15 @@ regular_page: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte_present(*pte)) goto out; - page = damon_get_page(pte_pfn(*pte)); - if (!page) + folio = damon_get_folio(pte_pfn(*pte)); + if (!folio) goto out; - if (pte_young(*pte) || !page_is_idle(page) || + if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = PAGE_SIZE; priv->young = true; } - put_page(page); + folio_put(folio); out: pte_unmap_unlock(pte, ptl); return 0; -- cgit v1.2.3 From 7824debb3d029e6a6252137fd10f3553cc8de22a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:48 +0800 Subject: mm/damon: remove unneeded damon_get_page() After all damon_get_page() callers are converted to damon_get_folio(), remove unneeded wrapper damon_get_page(). Link: https://lkml.kernel.org/r/20221230070849.63358-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 65f290f0a9d6..14f4bc69f29b 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -8,13 +8,6 @@ #include struct folio *damon_get_folio(unsigned long pfn); -static inline struct page *damon_get_page(unsigned long pfn) -{ - struct folio *folio = damon_get_folio(pfn); - - /* when folio is NULL, return &(0->page) mean return NULL */ - return &folio->page; -} void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -- cgit v1.2.3 From 6b7cea90c82e104c1151fec1f3ee216997fda652 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:49 +0800 Subject: mm/damon/vaddr: convert hugetlb related functions to use a folio Convert damon_hugetlb_mkold() and damon_young_hugetlb_entry() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 29227b7a6032..9d92c5eb3a1f 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -335,9 +335,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, { bool referenced = false; pte_t entry = huge_ptep_get(pte); - struct page *page = pte_page(entry); + struct folio *folio = pfn_folio(pte_pfn(entry)); - get_page(page); + folio_get(folio); if (pte_young(entry)) { referenced = true; @@ -352,10 +352,10 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); } static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, @@ -490,7 +490,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, { struct damon_young_walk_private *priv = walk->private; struct hstate *h = hstate_vma(walk->vma); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -499,16 +499,16 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto out; - page = pte_page(entry); - get_page(page); + folio = pfn_folio(pte_pfn(entry)); + folio_get(folio); - if (pte_young(entry) || !page_is_idle(page) || + if (pte_young(entry) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = huge_page_size(h); priv->young = true; } - put_page(page); + folio_put(folio); out: spin_unlock(ptl); -- cgit v1.2.3 From a79390f5d6a78647fd70856bd42b22d994de0ba2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:06 -0500 Subject: mm/mprotect: use long for page accountings and retval Switch to use type "long" for page accountings and retval across the whole procedure of change_protection(). The change should have shrinked the possible maximum page number to be half comparing to previous (ULONG_MAX / 2), but it shouldn't overflow on any system either because the maximum possible pages touched by change protection should be ULONG_MAX / PAGE_SIZE. Two reasons to switch from "unsigned long" to "long": 1. It suites better on count_vm_numa_events(), whose 2nd parameter takes a long type. 2. It paves way for returning negative (error) values in the future. Currently the only caller that consumes this retval is change_prot_numa(), where the unsigned long was converted to an int. Since at it, touching up the numa code to also take a long, so it'll avoid any possible overflow too during the int-size convertion. Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Kravetz Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 26 +++++++++++++------------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b6b10101bea7..e3aa336df900 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -248,7 +248,7 @@ void hugetlb_vma_lock_release(struct kref *kref); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); @@ -437,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio, { } -static inline unsigned long hugetlb_change_protection( +static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 329ed67edd76..4ac5ea4b584c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82f41024167..a0d6d0980064 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6615,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -6624,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0, psize = huge_page_size(h); + long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d3558248a0f0..a86b8f15e2f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -631,7 +631,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; - int nr_updated; + long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 71358e45a742..0af22ab59ea8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } -static unsigned long change_pte_range(struct mmu_gather *tlb, +static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + long pages = 0; int target_node = NUMA_NO_NODE; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -353,13 +353,13 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) } \ } while (0) -static inline unsigned long change_pmd_range(struct mmu_gather *tlb, +static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; + long pages = 0; unsigned long nr_huge_updates = 0; struct mmu_notifier_range range; @@ -367,7 +367,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; + long this_pages; next = pmd_addr_end(addr, end); @@ -437,13 +437,13 @@ next: return pages; } -static inline unsigned long change_pud_range(struct mmu_gather *tlb, +static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; - unsigned long pages = 0; + long pages = 0; pud = pud_offset(p4d, addr); do { @@ -458,13 +458,13 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb, return pages; } -static inline unsigned long change_p4d_range(struct mmu_gather *tlb, +static inline long change_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; - unsigned long pages = 0; + long pages = 0; p4d = p4d_offset(pgd, addr); do { @@ -479,14 +479,14 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb, return pages; } -static unsigned long change_protection_range(struct mmu_gather *tlb, +static long change_protection_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long pages = 0; + long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -505,12 +505,12 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, return pages; } -unsigned long change_protection(struct mmu_gather *tlb, +long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags) { pgprot_t newprot = vma->vm_page_prot; - unsigned long pages; + long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); -- cgit v1.2.3 From d1751118c88673fe5a948ad82277898e9e284c55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:07 -0500 Subject: mm/uffd: detect pgtable allocation failures Before this patch, when there's any pgtable allocation issues happened during change_protection(), the error will be ignored from the syscall. For shmem, there will be an error dumped into the host dmesg. Two issues with that: (1) Doing a trace dump when allocation fails is not anything close to grace. (2) The user should be notified with any kind of such error, so the user can trap it and decide what to do next, either by retrying, or stop the process properly, or anything else. For userfault users, this will change the API of UFFDIO_WRITEPROTECT when pgtable allocation failure happened. It should not normally break anyone, though. If it breaks, then in good ways. One man-page update will be on the way to introduce the new -ENOMEM for UFFDIO_WRITEPROTECT. Not marking stable so we keep the old behavior on the 5.19-till-now kernels. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230104225207.1066932-4-peterx@redhat.com Signed-off-by: Peter Xu Reported-by: James Houghton Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 +- mm/hugetlb.c | 6 +++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 63 ++++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 16 +++++++---- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc..3767f18114ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d6d0980064..6fe65f14d33b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6660,8 +6660,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); - if (!ptep) + if (!ptep) { + pages = -ENOMEM; break; + } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { @@ -6751,7 +6753,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); - return pages << h->order; + return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a86b8f15e2f0..85a34f1f3ab8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,7 +636,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated) + if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0af22ab59ea8..92fc6f3fa512 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -330,28 +330,34 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) /* * If wr-protecting the range for file-backed, populate pgtable for the case * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed it means no memory, we don't have a better option but stop. + * failed we treat it the same way as pgtable allocation failures during + * page faults by kicking OOM and returning error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ - if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ - break; \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) + /* * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. */ #define change_prepare(vma, high, low, addr, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ - if (WARN_ON_ONCE(p == NULL)) \ - break; \ + if (p == NULL) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, @@ -367,11 +373,15 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - long this_pages; + long ret; next = pmd_addr_end(addr, end); - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } /* * Automatic NUMA balancing walks the tables with mmap_lock * held for read. It's possible a parallel update to occur @@ -401,7 +411,11 @@ static inline long change_pmd_range(struct mmu_gather *tlb, * cleared; make sure pmd populated if * necessary, then fall-through to pte level. */ - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } } else { /* * change_huge_pmd() does not defer TLB flushes, @@ -422,9 +436,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(tlb, vma, pmd, addr, next, - newprot, cp_flags); - pages += this_pages; + pages += change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -443,12 +456,14 @@ static inline long change_pud_range(struct mmu_gather *tlb, { pud_t *pud; unsigned long next; - long pages = 0; + long pages = 0, ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - change_prepare(vma, pud, pmd, addr, cp_flags); + ret = change_prepare(vma, pud, pmd, addr, cp_flags); + if (ret) + return ret; if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, @@ -464,12 +479,14 @@ static inline long change_p4d_range(struct mmu_gather *tlb, { p4d_t *p4d; unsigned long next; - long pages = 0; + long pages = 0, ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - change_prepare(vma, p4d, pud, addr, cp_flags); + ret = change_prepare(vma, p4d, pud, addr, cp_flags); + if (ret) + return ret; if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, @@ -486,14 +503,18 @@ static long change_protection_range(struct mmu_gather *tlb, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - long pages = 0; + long pages = 0, ret; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); - change_prepare(vma, pgd, p4d, addr, cp_flags); + ret = change_prepare(vma, pgd, p4d, addr, cp_flags); + if (ret) { + pages = ret; + break; + } if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 65ad172add27..53c3d916ff66 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -710,11 +710,12 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } -void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, +long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; + long ret; if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; @@ -730,8 +731,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); + ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); + + return ret; } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, @@ -740,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, { struct vm_area_struct *dst_vma; unsigned long page_mask; - int err; + long err; /* * Sanitize the command parameters: @@ -779,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + + /* Return 0 on success, <0 on failures */ + if (err > 0) + err = 0; - err = 0; out_unlock: mmap_read_unlock(dst_mm); return err; -- cgit v1.2.3 From f78dfc7b77d5c3527d0f895bef693f711802de5a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Jan 2023 14:29:44 -0800 Subject: workingset: fix confusion around eviction vs refault container Refault decisions are made based on the lruvec where the page was evicted, as that determined its LRU order while it was alive. Stats and workingset aging must then occur on the lruvec of the new page, as that's the node and cgroup that experience the refault and that's the lruvec whose nonresident info ages out by a new resident page. Those lruvecs could be different when a page is shared between cgroups, or the refaulting page is allocated on a different node. There are currently two mix-ups: 1. When swap is available, the resident anon set must be considered when comparing the refault distance. The comparison is made against the right anon set, but the check for swap is not. When pages get evicted from a cgroup with swap, and refault in one without, this can incorrectly consider a hot refault as cold - and vice versa. Fix that by using the eviction cgroup for the swap check. 2. The stats and workingset age are updated against the wrong lruvec altogether: the right cgroup but the wrong NUMA node. When a page refaults on a different NUMA node, this will have confusing stats and distort the workingset age on a different lruvec - again possibly resulting in hot/cold misclassifications down the line. Fix the swap check and the refault pgdat to address both concerns. This was found during code review. It hasn't caused notable issues in production, suggesting that those refault-migrations are relatively rare in practice. Link: https://lkml.kernel.org/r/20230104222944.2380117-1-nphamcs@gmail.com Signed-off-by: Johannes Weiner Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/workingset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index fd666584515c..f194d13beabb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -457,6 +457,7 @@ void workingset_refault(struct folio *folio, void *shadow) */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); @@ -474,7 +475,7 @@ void workingset_refault(struct folio *folio, void *shadow) workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } - if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { -- cgit v1.2.3 From fc5744881eabcc73ed24a4229034f0fbdeb3f46f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 4 Jan 2023 21:18:05 +0200 Subject: mm/page_alloc: invert logic for early page initialisation checks Rename early_page_uninitialised() to early_page_initialised() and invert its logic to make the code more readable. Link: https://lkml.kernel.org/r/20230104191805.2535864-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5668c1a2de49..5514d84cc712 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -443,15 +443,15 @@ static inline bool deferred_pages_enabled(void) return static_branch_unlikely(&deferred_pages); } -/* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __meminit early_page_uninitialised(unsigned long pfn) +/* Returns true if the struct page for the pfn is initialised */ +static inline bool __meminit early_page_initialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; + return false; - return false; + return true; } /* @@ -498,9 +498,9 @@ static inline bool deferred_pages_enabled(void) return false; } -static inline bool early_page_uninitialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn) { - return false; + return true; } static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) @@ -1643,7 +1643,7 @@ static void __meminit init_reserved_page(unsigned long pfn) pg_data_t *pgdat; int nid, zid; - if (!early_page_uninitialised(pfn)) + if (early_page_initialised(pfn)) return; nid = early_pfn_to_nid(pfn); @@ -1806,7 +1806,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (early_page_uninitialised(pfn)) + if (!early_page_initialised(pfn)) return; if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ -- cgit v1.2.3 From 541e06b772c1aaffb3b6a245ccface36d7107af2 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Thu, 5 Jan 2023 16:05:34 +0000 Subject: maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk() Preallocations are common in the VMA code to avoid allocating under certain locking conditions. The preallocations must also cover the worst-case scenario. Removing the GFP_ZERO flag from the kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time spent zeroing memory that may not be used. Only zero out the necessary area to keep track of the allocations in the maple state. Zero the entire node prior to using it in the tree. This required internal changes to node counting on allocation, so the test code is also updated. This restores some micro-benchmark performance: up to +9% in mmtests mmap1 by my testing +10% to +20% in mmap, mmapaddr, mmapmany tests reported by Red Hat Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636 Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com Signed-off-by: Liam Howlett Reported-by: Jirka Hladky Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/maple_tree.c | 80 +++++++++++++++++++++------------------- tools/testing/radix-tree/maple.c | 18 ++++----- 2 files changed, 52 insertions(+), 46 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 94f0053ec3e0..8db3c336d19f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -149,13 +149,12 @@ struct maple_subtree_state { /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { - return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); + return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, - nodes); + return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) @@ -1125,9 +1124,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); + unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ - if (unlikely(!total)) + if (WARN_ON(!total)) return NULL; if (total == 1) { @@ -1137,27 +1137,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } - if (!node->node_count) { + if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; - node->slot[0] = NULL; mas->alloc->total = node->total - 1; ret = node; goto new_head; } - node->total--; - ret = node->slot[node->node_count]; - node->slot[node->node_count--] = NULL; + ret = node->slot[--node->node_count]; + node->slot[node->node_count] = NULL; single_node: new_head: - ret->total = 0; - ret->node_count = 0; - if (ret->request_count) { - mas_set_alloc_req(mas, ret->request_count + 1); - ret->request_count = 0; + if (req) { + req++; + mas_set_alloc_req(mas, req); } + + memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } @@ -1176,21 +1174,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); - memset(reuse, 0, sizeof(*reuse)); count = mas_allocated(mas); - if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { - if (head->slot[0]) - head->node_count++; - head->slot[head->node_count] = reuse; + reuse->request_count = 0; + reuse->node_count = 0; + if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { + head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { - head->request_count = 0; reuse->slot[0] = head; + reuse->node_count = 1; reuse->total += head->total; } @@ -1209,7 +1206,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); - unsigned long success = allocated; unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; @@ -1225,24 +1221,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; - if (allocated) + if (allocated) { node->slot[0] = mas->alloc; + node->node_count = 1; + } else { + node->node_count = 0; + } - success++; mas->alloc = node; + node->total = ++allocated; requested--; } node = mas->alloc; + node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS; - if (node->slot[0]) { - unsigned int offset = node->node_count + 1; + if (node->node_count) { + unsigned int offset = node->node_count; slots = (void **)&node->slot[offset]; max_req -= offset; @@ -1256,15 +1257,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; - /* zero indexed. */ - if (slots == (void **)&node->slot) - node->node_count--; - - success += count; + allocated += count; node = node->slot[0]; + node->node_count = 0; + node->request_count = 0; requested -= count; } - mas->alloc->total = success; + mas->alloc->total = allocated; return; nomem_bulk: @@ -1273,7 +1272,7 @@ nomem_bulk: nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = success; + mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } @@ -5734,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; + unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, @@ -5756,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + total = mas_allocated(mas); + while (total) { node = mas->alloc; mas->alloc = node->slot[0]; - if (node->node_count > 0) - mt_free_bulk(node->node_count, - (void __rcu **)&node->slot[1]); + if (node->node_count > 1) { + size_t count = node->node_count - 1; + + mt_free_bulk(count, (void __rcu **)&node->slot[1]); + total -= count; + } kmem_cache_free(maple_node_cache, node); + total--; } + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 81fa7ec2e66a..1f36bc1c5d36 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt) if (!MAPLE_32BIT) { if (i >= 35) - e = i - 35; + e = i - 34; else if (i >= 5) - e = i - 5; + e = i - 4; else if (i >= 2) - e = i - 2; + e = i - 1; } else { if (i >= 4) e = i - 4; @@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mn = mas_pop_node(&mas); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); /* Check the limit of pop/push/pop */ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ @@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); -- cgit v1.2.3 From 9eefefd835e451d340f5e95bc14ffd68b9b99268 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 5 Jan 2023 13:04:24 +0100 Subject: mm: remove an ambiguous sentence from kmap_local_folio() kdocs In the kdocs of kmap_local_folio() there is a an ambiguous sentence which suggests to use this API "only when really necessary". On the contrary, since kmap() and kmap_atomic() are deprecated, both kmap_local_folio(), as well as kmap_local_page(), must be preferred to the previous ones. Therefore, remove the above-mentioned sentence exactly how it has previously been done for the kmap_local_page() kdocs in commit 72f1c55adf70 ("highmem: delete a sentence from kmap_local_page() kdocs"). Link: https://lkml.kernel.org/r/20230105120424.30055-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44242268f53b..daeb0d8e753a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -119,9 +119,8 @@ static inline void *kmap_local_page(struct page *page); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. Only use when really - * necessary. + * While it is significantly faster than kmap() for the highmem case it + * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across -- cgit v1.2.3 From 1f8549fce525bc95df40ea3ddbfc6e8e719d188d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 5 Jan 2023 13:13:05 +0100 Subject: mm: fix spelling mistake in highmem.h Substitute "higmem" with "highmem" in highmem.h. Link: https://lkml.kernel.org/r/20230105121305.30714-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: "Matthew Wilcox (Oracle)" Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index daeb0d8e753a..d7097b8158f2 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -86,8 +86,8 @@ static inline void kmap_flush_unused(void); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. + * While kmap_local_page() is significantly faster than kmap() for the highmem + * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across -- cgit v1.2.3 From dee2ad120571f38433211098cd6b95a59bdfc8c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Jan 2023 15:49:05 +0100 Subject: selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP Currently, anonymous PTE-mapped THPs cannot be collapsed in-place: collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and mapping that new THP via a PMD: as it's a fresh anon THP, it will get the exclusive flag set on the head page and everybody is happy. However, if the kernel would ever support in-place collapse of anonymous THPs (replacing a page table mapping each sub-page of a THP via PTEs with a single PMD mapping the complete THP), exclusivity information stored for each sub-page would have to be collapsed accordingly: (1) All PTEs map !exclusive anon sub-pages: the in-place collapsed THP must not not have the exclusive flag set on the head page mapped by the PMD. This is the easiest case to handle ("simply don't set any exclusive flags"). (2) All PTEs map exclusive anon sub-pages: when collapsing, we have to clear the exclusive flag from all tail pages and only leave the exclusive flag set for the head page. Otherwise, fork() after collapse would not clear the exclusive flags from the tail pages and we'd be in trouble once PTE-mapping the shared THP when writing to shared tail pages that still have the exclusive flag set. This would effectively revert what the PTE-mapping code does when propagating the exclusive flag to all sub-pages. (3) PTEs map a mixture of exclusive and !exclusive anon sub-pages (can happen e.g., due to MADV_DONTFORK before fork()). We must not collapse the THP in-place, otherwise bad things may happen: the exclusive flags of sub-pages would get ignored and the exclusive flag of the head page would get used instead. Now that we have MADV_COLLAPSE in place to trigger collapsing a THP, let's add some test cases that would bail out early, if we'd voluntarily/accidantially unlock in-place collapse for anon THPs and forget about taking proper care of exclusive flags. Running the test on a kernel with MADV_COLLAPSE support: # [INFO] Anonymous THP tests # [RUN] Basic COW after fork() when collapsing before fork() ok 169 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (fully shared) ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument # [RUN] Basic COW after fork() when collapsing after fork() (lower shared) ok 171 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (upper shared) ok 172 No leak from parent into child For now, MADV_COLLAPSE always seems to fail if all PTEs map shared sub-pages. Link: https://lkml.kernel.org/r/20230104144905.460075-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Hugh Dickins Cc: Peter Xu Cc: Vlastimil Babka Cc: Nadav Amit Cc: Zach O'Keefe Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 228 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index 26f6ea3079e2..16216d893d96 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -30,6 +30,10 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + static size_t pagesize; static int pagemap_fd; static size_t thpsize; @@ -1178,6 +1182,228 @@ static int tests_per_anon_test_case(void) return tests; } +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. */ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (upper shared)", + test_anon_thp_collapse_upper_shared, + }, +}; + +static void run_anon_thp_test_cases(void) +{ + int i; + + if (!thpsize) + return; + + ksft_print_msg("[INFO] Anonymous THP tests\n"); + + for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { + struct test_case const *test_case = &anon_thp_test_cases[i]; + + ksft_print_msg("[RUN] %s\n", test_case->desc); + do_run_with_thp(test_case->fn, THP_RUN_PMD); + } +} + +static int tests_per_anon_thp_test_case(void) +{ + return thpsize ? 1 : 0; +} + typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); static void test_cow(char *mem, const char *smem, size_t size) @@ -1518,6 +1744,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); @@ -1526,6 +1753,7 @@ int main(int argc, char **argv) ksft_exit_fail_msg("opening pagemap failed\n"); run_anon_test_cases(); + run_anon_thp_test_cases(); run_non_anon_test_cases(); err = ksft_get_fail_cnt(); -- cgit v1.2.3 From bb94429096d090f5e209624d08919a7962d6c4b7 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 4 Jan 2023 14:06:04 +0800 Subject: mm/slab: add is_kmalloc_cache() helper function commit 6edf2576a6cc ("mm/slub: enable debugging memory wasting of kmalloc") introduces 'SLAB_KMALLOC' bit specifying whether a kmem_cache is a kmalloc cache for slab/slub (slob doesn't have dedicated kmalloc caches). Add a helper inline function for other components like kasan to simplify code. Link: https://lkml.kernel.org/r/20230104060605.930910-1-feng.tang@intel.com Signed-off-by: Feng Tang Acked-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/slab.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/slab.h b/mm/slab.h index 7cc432969945..63fb4c00d529 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -323,6 +323,14 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, } #endif +static inline bool is_kmalloc_cache(struct kmem_cache *s) +{ +#ifndef CONFIG_SLOB + return (s->flags & SLAB_KMALLOC); +#else + return false; +#endif +} /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ -- cgit v1.2.3 From bbc61844b4645d54c147a82654ac974bb7be85de Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 4 Jan 2023 14:06:05 +0800 Subject: mm/kasan: simplify and refine kasan_cache code struct 'kasan_cache' has a member 'is_kmalloc' indicating whether its host kmem_cache is a kmalloc cache. With newly introduced is_kmalloc_cache() helper, 'is_kmalloc' and its related function can be replaced and removed. Also 'kasan_cache' is only needed by KASAN generic mode, and not by SW/HW tag modes, so refine its protection macro accordingly, suggested by Andrey Konoval. Link: https://lkml.kernel.org/r/20230104060605.930910-2-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Andrey Konovalov Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 22 +++++----------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- mm/kasan/common.c | 9 ++------- mm/slab_common.c | 1 - 5 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5ebbaf672009..f7ef70661ce2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,15 +96,6 @@ static inline bool kasan_has_integrated_init(void) } #ifdef CONFIG_KASAN - -struct kasan_cache { -#ifdef CONFIG_KASAN_GENERIC - int alloc_meta_offset; - int free_meta_offset; -#endif - bool is_kmalloc; -}; - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -129,13 +120,6 @@ static __always_inline bool kasan_unpoison_pages(struct page *page, return false; } -void __kasan_cache_create_kmalloc(struct kmem_cache *cache); -static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) -{ - if (kasan_enabled()) - __kasan_cache_create_kmalloc(cache); -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -255,7 +239,6 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, { return false; } -static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -306,6 +289,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; + size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5834bad8ad78..a61e7d55d0d3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -72,7 +72,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index aa0ee1678d29..f6df03f934e5 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -136,7 +136,7 @@ struct kmem_cache { unsigned int *random_seq; #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1d0008e1c420..6b8e9c848573 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -122,11 +122,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -void __kasan_cache_create_kmalloc(struct kmem_cache *cache) -{ - cache->kasan_info.is_kmalloc = true; -} - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); @@ -326,7 +321,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache)) kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; @@ -372,7 +367,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache)) kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 1cba98acc486..bf4e777cfe90 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -670,7 +670,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, usersize); - kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; -- cgit v1.2.3 From e9adcfecf572fcfaa9f8525904cf49c709974f73 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 3 Jan 2023 16:27:32 -0800 Subject: mm: remove zap_page_range and create zap_vma_pages zap_page_range was originally designed to unmap pages within an address range that could span multiple vmas. While working on [1], it was discovered that all callers of zap_page_range pass a range entirely within a single vma. In addition, the mmu notification call within zap_page range does not correctly handle ranges that span multiple vmas. When crossing a vma boundary, a new mmu_notifier_range_init/end call pair with the new vma should be made. Instead of fixing zap_page_range, do the following: - Create a new routine zap_vma_pages() that will remove all pages within the passed vma. Most users of zap_page_range pass the entire vma and can use this new routine. - For callers of zap_page_range not passing the entire vma, instead call zap_page_range_single(). - Remove zap_page_range. [1] https://lore.kernel.org/linux-mm/20221114235507.294320-2-mike.kravetz@oracle.com/ Link: https://lkml.kernel.org/r/20230104002732.232573-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Peter Xu Acked-by: Michal Hocko Acked-by: Peter Xu Acked-by: Heiko Carstens [s390] Reviewed-by: Christoph Hellwig Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dave Hansen Cc: David Hildenbrand Cc: Eric Dumazet Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nadav Amit Cc: Palmer Dabbelt Cc: Rik van Riel Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/vdso.c | 6 ++---- arch/powerpc/kernel/vdso.c | 4 +--- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 3 +-- arch/riscv/kernel/vdso.c | 6 ++---- arch/s390/kernel/vdso.c | 4 +--- arch/s390/mm/gmap.c | 2 +- arch/x86/entry/vdso/vma.c | 4 +--- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 7 +++++-- mm/memory.c | 30 ------------------------------ mm/page-writeback.c | 2 +- net/ipv4/tcp.c | 7 ++++--- 13 files changed, 21 insertions(+), 58 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e59a32aa0c49..0119dc91abb5 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -138,13 +138,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #ifdef CONFIG_COMPAT_VDSO if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #endif } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 507f8228f983..7a2ff9010f17 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -120,10 +120,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, &vvar_spec)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); } mmap_read_unlock(mm); diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index eb5bed333750..9580e8e12165 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) /* * When the LPAR lost credits due to core removal or during * migration, invalidate the existing mapping for the current - * paste addresses and set windows in-active (zap_page_range in + * paste addresses and set windows in-active (zap_vma_pages in * reconfig_close_windows()). * New mapping will be done later after migration or new credits * available. So continue to receive faults if the user space diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 4ad6e510d405..559112312810 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -760,8 +760,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * is done before the original mmap() and after the ioctl. */ if (vma) - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start); + zap_vma_pages(vma); mmap_write_unlock(task_ref->mm); mutex_unlock(&task_ref->mmap_mutex); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index e410275918ac..5c30212d8d1c 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -124,13 +124,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, vdso_info.dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #ifdef CONFIG_COMPAT if (vma_is_special_mapping(vma, compat_vdso_info.dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #endif } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ff7bf4432229..bbaefd84f15e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -59,11 +59,9 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (!vma_is_special_mapping(vma, &vvar_mapping)) continue; - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); break; } mmap_read_unlock(mm); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 74e1d873dce0..69af6cdf1a2a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -722,7 +722,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } mmap_read_unlock(gmap->mm); } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f3f9b9e53c..ec5e4d2048cb 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -113,10 +113,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, &vvar_mapping)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); } mmap_read_unlock(mm); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 4ad42b0f75cd..55a3c3c2409f 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1019,7 +1019,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(vma, page_addr, PAGE_SIZE); + zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ac5ea4b584c..eb5bfc77c2c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1977,10 +1977,13 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ + zap_page_range_single(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); +} void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); diff --git a/mm/memory.c b/mm/memory.c index 1598051a2a24..b0dda866ffe6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,36 +1693,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, mmu_notifier_invalidate_range_end(&range); } -/** - * zap_page_range - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @start: starting address of pages to zap - * @size: number of bytes to zap - * - * Caller must protect the VMA list - */ -void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size) -{ - struct maple_tree *mt = &vma->vm_mm->mm_mt; - unsigned long end = start + size; - struct mmu_notifier_range range; - struct mmu_gather tlb; - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - - lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - start, start + size); - tlb_gather_mmu(&tlb, vma->vm_mm); - update_hiwater_rss(vma->vm_mm); - mmu_notifier_invalidate_range_start(&range); - do { - unmap_single_vma(&tlb, vma, start, range.end, NULL); - } while ((vma = mas_find(&mas, end - 1)) != NULL); - mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); -} - /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 337cafe9978c..e91f94b3438b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2690,7 +2690,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * * The caller must hold lock_page_memcg(). Most callers have the folio * locked. A few have the folio blocked from truncation through other - * means (eg zap_page_range() has it mapped and is holding the page table + * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I * cannot prove is always protected against truncate. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c567d5e8053e..f713c0422f0f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2092,7 +2092,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range(vma, *address, maybe_zap_len); + zap_page_range_single(vma, *address, maybe_zap_len, NULL); err = 0; } @@ -2100,7 +2100,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, unsigned long leftover_pages = pages_remaining; int bytes_mapped; - /* We called zap_page_range, try to reinsert. */ + /* We called zap_page_range_single, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); @@ -2234,7 +2234,8 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range(vma, address, total_bytes_to_map); + zap_page_range_single(vma, address, total_bytes_to_map, + NULL); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { -- cgit v1.2.3 From 183986209935b51d50de819eeae512a155be3568 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:50 +0000 Subject: MAINTAINERS: add types to akpm/mm git trees entries Patch series "mm: trivial fixups". This patchset is for trivial fixups of mm stuff on MAINTAINERS, tools/ selftests, and docs. This patch (of 5): Each SCM tree entry of MAINTAINERS file should have both type and location, but akpm/mm git tree entries of 'MEMORY MANAGEMENT' and 'VMALLOC' sections of the file don't have the type. Add the type. Link: https://lkml.kernel.org/r/20230103180754.129637-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 123216b76534..6871e24c2de5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13474,7 +13474,7 @@ M: Andrew Morton L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org -T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new F: include/linux/gfp.h F: include/linux/gfp_types.h @@ -13492,7 +13492,7 @@ R: Christoph Hellwig L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org -T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/linux/vmalloc.h F: mm/vmalloc.c -- cgit v1.2.3 From 060deca404ba7c2f499fcee793956c502c60e193 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:51 +0000 Subject: MAINTAINERS/MEMORY MANAGEMENT: add tools/vm/ as managed files 'tools/vm/' directory should be a part of memory management subsystem, but MAINTAINERS file doesn't mark the directory so. Add one more 'F:' entry for the directory to 'MEMORY MANAGEMENT' section. Link: https://lkml.kernel.org/r/20230103180754.129637-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6871e24c2de5..c05f95aa7af1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13484,6 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/testing/selftests/vm/ +F: tools/vm/ VMALLOC M: Andrew Morton -- cgit v1.2.3 From 799fb82aa132fa3a3886b7872997a5a84e820062 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:52 +0000 Subject: tools/vm: rename tools/vm to tools/mm Rename tools/vm to tools/mm for being more consistent with the code and documentation directories, and won't be confused with virtual machines. Link: https://lkml.kernel.org/r/20230103180754.129637-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../admin-guide/mm/idle_page_tracking.rst | 2 +- Documentation/admin-guide/mm/pagemap.rst | 4 +- Documentation/mm/page_owner.rst | 2 +- Documentation/mm/slub.rst | 2 +- Documentation/translations/zh_CN/mm/page_owner.rst | 2 +- MAINTAINERS | 2 +- mm/Kconfig.debug | 2 +- mm/memory-failure.c | 2 +- tools/mm/.gitignore | 4 + tools/mm/Makefile | 32 + tools/mm/page-types.c | 1396 ++++++++++++++++++ tools/mm/page_owner_sort.c | 897 ++++++++++++ tools/mm/slabinfo-gnuplot.sh | 268 ++++ tools/mm/slabinfo.c | 1544 ++++++++++++++++++++ tools/vm/.gitignore | 4 - tools/vm/Makefile | 32 - tools/vm/page-types.c | 1396 ------------------ tools/vm/page_owner_sort.c | 897 ------------ tools/vm/slabinfo-gnuplot.sh | 268 ---- tools/vm/slabinfo.c | 1544 -------------------- 20 files changed, 4150 insertions(+), 4150 deletions(-) create mode 100644 tools/mm/.gitignore create mode 100644 tools/mm/Makefile create mode 100644 tools/mm/page-types.c create mode 100644 tools/mm/page_owner_sort.c create mode 100644 tools/mm/slabinfo-gnuplot.sh create mode 100644 tools/mm/slabinfo.c delete mode 100644 tools/vm/.gitignore delete mode 100644 tools/vm/Makefile delete mode 100644 tools/vm/page-types.c delete mode 100644 tools/vm/page_owner_sort.c delete mode 100644 tools/vm/slabinfo-gnuplot.sh delete mode 100644 tools/vm/slabinfo.c diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst index df9394fb39c2..19492064278c 100644 --- a/Documentation/admin-guide/mm/idle_page_tracking.rst +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst @@ -65,7 +65,7 @@ workload one should: are not reclaimable, he or she can filter them out using ``/proc/kpageflags``. -The page-types tool in the tools/vm directory can be used to assist in this. +The page-types tool in the tools/mm directory can be used to assist in this. If the tool is run initially with the appropriate option, it will mark all the queried pages as idle. Subsequent runs of the tool can then show which pages have their idle flag cleared in the interim. diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 6e2e416af783..ceb5da3172ba 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -46,7 +46,7 @@ There are four components to pagemap: * ``/proc/kpagecount``. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the number of times a page is mapped. * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each @@ -173,7 +173,7 @@ LRU related page flags 14 - SWAPBACKED The page is backed by swap/RAM. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the above flags. Using pagemap to do something useful diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 127514955a5e..5df26c0a0c1f 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -61,7 +61,7 @@ Usage 1) Build user-space helper:: - cd tools/vm + cd tools/mm make page_owner_sort 2) Enable page owner: add "page_owner=on" to boot cmdline. diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 7f652216dabe..3ffa7eded251 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -21,7 +21,7 @@ slabs that have data in them. See "slabinfo -h" for more options when running the command. ``slabinfo`` can be compiled with :: - gcc -o slabinfo tools/vm/slabinfo.c + gcc -o slabinfo tools/mm/slabinfo.c Some of the modes of operation of ``slabinfo`` require that slub debugging be enabled on the command line. F.e. no tracking information will be diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst index 21a6a0837d42..4d3b2c33e4ef 100644 --- a/Documentation/translations/zh_CN/mm/page_owner.rst +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -62,7 +62,7 @@ page owner在默认情况下是禁用的。所以,如果你想使用它,你 1) 构建用户空间的帮助:: - cd tools/vm + cd tools/mm make page_owner_sort 2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. diff --git a/MAINTAINERS b/MAINTAINERS index c05f95aa7af1..c726adfd1f0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13483,8 +13483,8 @@ F: include/linux/mm.h F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ +F: tools/mm/ F: tools/testing/selftests/vm/ -F: tools/vm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index fca699ad1fb0..d62f48131952 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -90,7 +90,7 @@ config PAGE_OWNER help to find bare alloc_page(s) leaks. Even if you include this feature on your build, it is disabled in default. You should pass "page_owner=on" to boot parameter in order to enable it. Eats - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + a fair amount of memory if enabled. See tools/mm/page_owner_sort.c for user-space helper. If unsure, say N. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c77a9e37e27e..6bf07345ea2c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -24,7 +24,7 @@ * - You have a test that can be added to mce-test * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in - * tools/vm/page-types when running a real workload. + * tools/mm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back diff --git a/tools/mm/.gitignore b/tools/mm/.gitignore new file mode 100644 index 000000000000..922879f93fc8 --- /dev/null +++ b/tools/mm/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +slabinfo +page-types +page_owner_sort diff --git a/tools/mm/Makefile b/tools/mm/Makefile new file mode 100644 index 000000000000..9860622cbb15 --- /dev/null +++ b/tools/mm/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for vm tools +# +include ../scripts/Makefile.include + +TARGETS=page-types slabinfo page_owner_sort + +LIB_DIR = ../lib/api +LIBS = $(LIB_DIR)/libapi.a + +CFLAGS = -Wall -Wextra -I../lib/ +LDFLAGS = $(LIBS) + +all: $(TARGETS) + +$(TARGETS): $(LIBS) + +$(LIBS): + make -C $(LIB_DIR) + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +clean: + $(RM) page-types slabinfo page_owner_sort + make -C $(LIB_DIR) clean + +sbindir ?= /usr/sbin + +install: all + install -d $(DESTDIR)$(sbindir) + install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c new file mode 100644 index 000000000000..381dcc00cb62 --- /dev/null +++ b/tools/mm/page-types.c @@ -0,0 +1,1396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * page-types: Tool for querying page flags + * + * Copyright (C) 2009 Intel corporation + * + * Authors: Wu Fengguang + */ + +#define _FILE_OFFSET_BITS 64 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../include/uapi/linux/magic.h" +#include "../../include/uapi/linux/kernel-page-flags.h" +#include + +#ifndef MAX_PATH +# define MAX_PATH 256 +#endif + +#ifndef STR +# define _STR(x) #x +# define STR(x) _STR(x) +#endif + +/* + * pagemap kernel ABI bits + */ + +#define PM_ENTRY_BYTES 8 +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +#define MAX_SWAPFILES_SHIFT 5 +#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT) +#define PM_SOFT_DIRTY (1ULL << 55) +#define PM_MMAP_EXCLUSIVE (1ULL << 56) +#define PM_FILE (1ULL << 61) +#define PM_SWAP (1ULL << 62) +#define PM_PRESENT (1ULL << 63) + +/* + * kernel page flags + */ + +#define KPF_BYTES 8 +#define PROC_KPAGEFLAGS "/proc/kpageflags" +#define PROC_KPAGECOUNT "/proc/kpagecount" +#define PROC_KPAGECGROUP "/proc/kpagecgroup" + +#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap" + +/* [32-] kernel hacking assistances */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 +#define KPF_SOFTDIRTY 40 +#define KPF_ARCH_2 41 + +/* [47-] take some arbitrary free slots for expanding overloaded flags + * not part of kernel API + */ +#define KPF_ANON_EXCLUSIVE 47 +#define KPF_READAHEAD 48 +#define KPF_SLOB_FREE 49 +#define KPF_SLUB_FROZEN 50 +#define KPF_SLUB_DEBUG 51 +#define KPF_FILE 61 +#define KPF_SWAP 62 +#define KPF_MMAP_EXCLUSIVE 63 + +#define KPF_ALL_BITS ((uint64_t)~0ULL) +#define KPF_HACKERS_BITS (0xffffULL << 32) +#define KPF_OVERLOADED_BITS (0xffffULL << 48) +#define BIT(name) (1ULL << KPF_##name) +#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) + +static const char * const page_flag_names[] = { + [KPF_LOCKED] = "L:locked", + [KPF_ERROR] = "E:error", + [KPF_REFERENCED] = "R:referenced", + [KPF_UPTODATE] = "U:uptodate", + [KPF_DIRTY] = "D:dirty", + [KPF_LRU] = "l:lru", + [KPF_ACTIVE] = "A:active", + [KPF_SLAB] = "S:slab", + [KPF_WRITEBACK] = "W:writeback", + [KPF_RECLAIM] = "I:reclaim", + [KPF_BUDDY] = "B:buddy", + + [KPF_MMAP] = "M:mmap", + [KPF_ANON] = "a:anonymous", + [KPF_SWAPCACHE] = "s:swapcache", + [KPF_SWAPBACKED] = "b:swapbacked", + [KPF_COMPOUND_HEAD] = "H:compound_head", + [KPF_COMPOUND_TAIL] = "T:compound_tail", + [KPF_HUGE] = "G:huge", + [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", + [KPF_NOPAGE] = "n:nopage", + [KPF_KSM] = "x:ksm", + [KPF_THP] = "t:thp", + [KPF_OFFLINE] = "o:offline", + [KPF_PGTABLE] = "g:pgtable", + [KPF_ZERO_PAGE] = "z:zero_page", + [KPF_IDLE] = "i:idle_page", + + [KPF_RESERVED] = "r:reserved", + [KPF_MLOCKED] = "m:mlocked", + [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_PRIVATE] = "P:private", + [KPF_PRIVATE_2] = "p:private_2", + [KPF_OWNER_PRIVATE] = "O:owner_private", + [KPF_ARCH] = "h:arch", + [KPF_UNCACHED] = "c:uncached", + [KPF_SOFTDIRTY] = "f:softdirty", + [KPF_ARCH_2] = "H:arch_2", + + [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive", + [KPF_READAHEAD] = "I:readahead", + [KPF_SLOB_FREE] = "P:slob_free", + [KPF_SLUB_FROZEN] = "A:slub_frozen", + [KPF_SLUB_DEBUG] = "E:slub_debug", + + [KPF_FILE] = "F:file", + [KPF_SWAP] = "w:swap", + [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", +}; + + +/* + * data structures + */ + +static int opt_raw; /* for kernel developers */ +static int opt_list; /* list pages (in ranges) */ +static int opt_mark_idle; /* set accessed bit */ +static int opt_no_summary; /* don't show summary */ +static pid_t opt_pid; /* process to walk */ +const char *opt_file; /* file or directory path */ +static uint64_t opt_cgroup; /* cgroup inode */ +static int opt_list_cgroup;/* list page cgroup */ +static int opt_list_mapcnt;/* list page map count */ +static const char *opt_kpageflags;/* kpageflags file to parse */ + +#define MAX_ADDR_RANGES 1024 +static int nr_addr_ranges; +static unsigned long opt_offset[MAX_ADDR_RANGES]; +static unsigned long opt_size[MAX_ADDR_RANGES]; + +#define MAX_VMAS 10240 +static int nr_vmas; +static unsigned long pg_start[MAX_VMAS]; +static unsigned long pg_end[MAX_VMAS]; + +#define MAX_BIT_FILTERS 64 +static int nr_bit_filters; +static uint64_t opt_mask[MAX_BIT_FILTERS]; +static uint64_t opt_bits[MAX_BIT_FILTERS]; + +static int page_size; + +static int pagemap_fd; +static int kpageflags_fd; +static int kpagecount_fd = -1; +static int kpagecgroup_fd = -1; +static int page_idle_fd = -1; + +static int opt_hwpoison; +static int opt_unpoison; + +static const char *hwpoison_debug_fs; +static int hwpoison_inject_fd; +static int hwpoison_forget_fd; + +#define HASH_SHIFT 13 +#define HASH_SIZE (1 << HASH_SHIFT) +#define HASH_MASK (HASH_SIZE - 1) +#define HASH_KEY(flags) (flags & HASH_MASK) + +static unsigned long total_pages; +static unsigned long nr_pages[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; + + +/* + * helper functions + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define min_t(type, x, y) ({ \ + type __min1 = (x); \ + type __min2 = (y); \ + __min1 < __min2 ? __min1 : __min2; }) + +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1 : __max2; }) + +static unsigned long pages2mb(unsigned long pages) +{ + return (pages * page_size) >> 20; +} + +static void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static int checked_open(const char *pathname, int flags) +{ + int fd = open(pathname, flags); + + if (fd < 0) { + perror(pathname); + exit(EXIT_FAILURE); + } + + return fd; +} + +/* + * pagemap/kpageflags routines + */ + +static unsigned long do_u64_read(int fd, const char *name, + uint64_t *buf, + unsigned long index, + unsigned long count) +{ + long bytes; + + if (index > ULONG_MAX / 8) + fatal("index overflow: %lu\n", index); + + bytes = pread(fd, buf, count * 8, (off_t)index * 8); + if (bytes < 0) { + perror(name); + exit(EXIT_FAILURE); + } + if (bytes % 8) + fatal("partial read: %lu bytes\n", bytes); + + return bytes / 8; +} + +static unsigned long kpageflags_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); +} + +static unsigned long kpagecgroup_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + if (kpagecgroup_fd < 0) + return pages; + + return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); +} + +static unsigned long kpagecount_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return kpagecount_fd < 0 ? pages : + do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, + buf, index, pages); +} + +static unsigned long pagemap_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); +} + +static unsigned long pagemap_pfn(uint64_t val) +{ + unsigned long pfn; + + if (val & PM_PRESENT) + pfn = PM_PFRAME(val); + else + pfn = 0; + + return pfn; +} + +static unsigned long pagemap_swap_offset(uint64_t val) +{ + return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0; +} + +/* + * page flag names + */ + +static char *page_flag_name(uint64_t flags) +{ + static char buf[65]; + int present; + size_t i, j; + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + present = (flags >> i) & 1; + if (!page_flag_names[i]) { + if (present) + fatal("unknown flag bit %d\n", i); + continue; + } + buf[j++] = present ? page_flag_names[i][0] : '_'; + } + + return buf; +} + +static char *page_flag_longname(uint64_t flags) +{ + static char buf[1024]; + size_t i, n; + + for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if ((flags >> i) & 1) + n += snprintf(buf + n, sizeof(buf) - n, "%s,", + page_flag_names[i] + 2); + } + if (n) + n--; + buf[n] = '\0'; + + return buf; +} + + +/* + * page list and summary + */ + +static void show_page_range(unsigned long voffset, unsigned long offset, + unsigned long size, uint64_t flags, + uint64_t cgroup, uint64_t mapcnt) +{ + static uint64_t flags0; + static uint64_t cgroup0; + static uint64_t mapcnt0; + static unsigned long voff; + static unsigned long index; + static unsigned long count; + + if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 && + offset == index + count && size && voffset == voff + count) { + count += size; + return; + } + + if (count) { + if (opt_pid) + printf("%lx\t", voff); + if (opt_file) + printf("%lx\t", voff); + if (opt_list_cgroup) + printf("@%llu\t", (unsigned long long)cgroup0); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt0); + printf("%lx\t%lx\t%s\n", + index, count, page_flag_name(flags0)); + } + + flags0 = flags; + cgroup0 = cgroup; + mapcnt0 = mapcnt; + index = offset; + voff = voffset; + count = size; +} + +static void flush_page_range(void) +{ + show_page_range(0, 0, 0, 0, 0, 0); +} + +static void show_page(unsigned long voffset, unsigned long offset, + uint64_t flags, uint64_t cgroup, uint64_t mapcnt) +{ + if (opt_pid) + printf("%lx\t", voffset); + if (opt_file) + printf("%lx\t", voffset); + if (opt_list_cgroup) + printf("@%llu\t", (unsigned long long)cgroup); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt); + + printf("%lx\t%s\n", offset, page_flag_name(flags)); +} + +static void show_summary(void) +{ + size_t i; + + printf(" flags\tpage-count MB" + " symbolic-flags\t\t\tlong-symbolic-flags\n"); + + for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { + if (nr_pages[i]) + printf("0x%016llx\t%10lu %8lu %s\t%s\n", + (unsigned long long)page_flags[i], + nr_pages[i], + pages2mb(nr_pages[i]), + page_flag_name(page_flags[i]), + page_flag_longname(page_flags[i])); + } + + printf(" total\t%10lu %8lu\n", + total_pages, pages2mb(total_pages)); +} + + +/* + * page flag filters + */ + +static int bit_mask_ok(uint64_t flags) +{ + int i; + + for (i = 0; i < nr_bit_filters; i++) { + if (opt_bits[i] == KPF_ALL_BITS) { + if ((flags & opt_mask[i]) == 0) + return 0; + } else { + if ((flags & opt_mask[i]) != opt_bits[i]) + return 0; + } + } + + return 1; +} + +static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) +{ + /* Anonymous pages overload PG_mappedtodisk */ + if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) + flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + + /* SLOB/SLUB overload several page flags */ + if (flags & BIT(SLAB)) { + if (flags & BIT(PRIVATE)) + flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); + if (flags & BIT(ACTIVE)) + flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); + if (flags & BIT(ERROR)) + flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); + } + + /* PG_reclaim is overloaded as PG_readahead in the read path */ + if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) + flags ^= BIT(RECLAIM) | BIT(READAHEAD); + + if (pme & PM_SOFT_DIRTY) + flags |= BIT(SOFTDIRTY); + if (pme & PM_FILE) + flags |= BIT(FILE); + if (pme & PM_SWAP) + flags |= BIT(SWAP); + if (pme & PM_MMAP_EXCLUSIVE) + flags |= BIT(MMAP_EXCLUSIVE); + + return flags; +} + +static uint64_t well_known_flags(uint64_t flags) +{ + /* hide flags intended only for kernel hacker */ + flags &= ~KPF_HACKERS_BITS; + + /* hide non-hugeTLB compound pages */ + if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) + flags &= ~BITS_COMPOUND; + + return flags; +} + +static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme) +{ + if (opt_raw) + flags = expand_overloaded_flags(flags, pme); + else + flags = well_known_flags(flags); + + return flags; +} + +/* + * page actions + */ + +static void prepare_hwpoison_fd(void) +{ + char buf[MAX_PATH + 1]; + + hwpoison_debug_fs = debugfs__mount(); + if (!hwpoison_debug_fs) { + perror("mount debugfs"); + exit(EXIT_FAILURE); + } + + if (opt_hwpoison && !hwpoison_inject_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn", + hwpoison_debug_fs); + hwpoison_inject_fd = checked_open(buf, O_WRONLY); + } + + if (opt_unpoison && !hwpoison_forget_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn", + hwpoison_debug_fs); + hwpoison_forget_fd = checked_open(buf, O_WRONLY); + } +} + +static int hwpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_inject_fd, buf, len); + if (len < 0) { + perror("hwpoison inject"); + return len; + } + return 0; +} + +static int unpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_forget_fd, buf, len); + if (len < 0) { + perror("hwpoison forget"); + return len; + } + return 0; +} + +static int mark_page_idle(unsigned long offset) +{ + static unsigned long off; + static uint64_t buf; + int len; + + if ((offset / 64 == off / 64) || buf == 0) { + buf |= 1UL << (offset % 64); + off = offset; + return 0; + } + + len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64)); + if (len < 0) { + perror("mark page idle"); + return len; + } + + buf = 1UL << (offset % 64); + off = offset; + + return 0; +} + +/* + * page frame walker + */ + +static size_t hash_slot(uint64_t flags) +{ + size_t k = HASH_KEY(flags); + size_t i; + + /* Explicitly reserve slot 0 for flags 0: the following logic + * cannot distinguish an unoccupied slot from slot (flags==0). + */ + if (flags == 0) + return 0; + + /* search through the remaining (HASH_SIZE-1) slots */ + for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { + if (!k || k >= ARRAY_SIZE(page_flags)) + k = 1; + if (page_flags[k] == 0) { + page_flags[k] = flags; + return k; + } + if (page_flags[k] == flags) + return k; + } + + fatal("hash table full: bump up HASH_SHIFT?\n"); + exit(EXIT_FAILURE); +} + +static void add_page(unsigned long voffset, unsigned long offset, + uint64_t flags, uint64_t cgroup, uint64_t mapcnt, + uint64_t pme) +{ + flags = kpageflags_flags(flags, pme); + + if (!bit_mask_ok(flags)) + return; + + if (opt_cgroup && cgroup != (uint64_t)opt_cgroup) + return; + + if (opt_hwpoison) + hwpoison_page(offset); + if (opt_unpoison) + unpoison_page(offset); + + if (opt_mark_idle) + mark_page_idle(offset); + + if (opt_list == 1) + show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); + else if (opt_list == 2) + show_page(voffset, offset, flags, cgroup, mapcnt); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ +static void walk_pfn(unsigned long voffset, + unsigned long index, + unsigned long count, + uint64_t pme) +{ + uint64_t buf[KPAGEFLAGS_BATCH]; + uint64_t cgi[KPAGEFLAGS_BATCH]; + uint64_t cnt[KPAGEFLAGS_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long i; + + /* + * kpagecgroup_read() reads only if kpagecgroup were opened, but + * /proc/kpagecgroup might even not exist, so it's better to fill + * them with zeros here. + */ + if (count == 1) + cgi[0] = 0; + else + memset(cgi, 0, sizeof cgi); + + while (count) { + batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); + pages = kpageflags_read(buf, index, batch); + if (pages == 0) + break; + + if (kpagecgroup_read(cgi, index, pages) != pages) + fatal("kpagecgroup returned fewer pages than expected"); + + if (kpagecount_read(cnt, index, pages) != pages) + fatal("kpagecount returned fewer pages than expected"); + + for (i = 0; i < pages; i++) + add_page(voffset + i, index + i, + buf[i], cgi[i], cnt[i], pme); + + index += pages; + count -= pages; + } +} + +static void walk_swap(unsigned long voffset, uint64_t pme) +{ + uint64_t flags = kpageflags_flags(0, pme); + + if (!bit_mask_ok(flags)) + return; + + if (opt_cgroup) + return; + + if (opt_list == 1) + show_page_range(voffset, pagemap_swap_offset(pme), + 1, flags, 0, 0); + else if (opt_list == 2) + show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +#define PAGEMAP_BATCH (64 << 10) +static void walk_vma(unsigned long index, unsigned long count) +{ + uint64_t buf[PAGEMAP_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long pfn; + unsigned long i; + + while (count) { + batch = min_t(unsigned long, count, PAGEMAP_BATCH); + pages = pagemap_read(buf, index, batch); + if (pages == 0) + break; + + for (i = 0; i < pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (pfn) + walk_pfn(index + i, pfn, 1, buf[i]); + if (buf[i] & PM_SWAP) + walk_swap(index + i, buf[i]); + } + + index += pages; + count -= pages; + } +} + +static void walk_task(unsigned long index, unsigned long count) +{ + const unsigned long end = index + count; + unsigned long start; + int i = 0; + + while (index < end) { + + while (pg_end[i] <= index) + if (++i >= nr_vmas) + return; + if (pg_start[i] >= end) + return; + + start = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); + + assert(start < index); + walk_vma(start, index - start); + } +} + +static void add_addr_range(unsigned long offset, unsigned long size) +{ + if (nr_addr_ranges >= MAX_ADDR_RANGES) + fatal("too many addr ranges\n"); + + opt_offset[nr_addr_ranges] = offset; + opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); + nr_addr_ranges++; +} + +static void walk_addr_ranges(void) +{ + int i; + + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); + + if (!nr_addr_ranges) + add_addr_range(0, ULONG_MAX); + + for (i = 0; i < nr_addr_ranges; i++) + if (!opt_pid) + walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0); + else + walk_task(opt_offset[i], opt_size[i]); + + if (opt_mark_idle) + mark_page_idle(0); + + close(kpageflags_fd); +} + + +/* + * user interface + */ + +static const char *page_flag_type(uint64_t flag) +{ + if (flag & KPF_HACKERS_BITS) + return "(r)"; + if (flag & KPF_OVERLOADED_BITS) + return "(o)"; + return " "; +} + +static void usage(void) +{ + size_t i, j; + + printf( +"page-types [options]\n" +" -r|--raw Raw mode, for kernel developers\n" +" -d|--describe flags Describe flags\n" +" -a|--addr addr-spec Walk a range of pages\n" +" -b|--bits bits-spec Walk pages with specified bits\n" +" -c|--cgroup path|@inode Walk pages within memory cgroup\n" +" -p|--pid pid Walk process address space\n" +" -f|--file filename Walk file address space\n" +" -i|--mark-idle Mark pages idle\n" +" -l|--list Show page details in ranges\n" +" -L|--list-each Show page details one by one\n" +" -C|--list-cgroup Show cgroup inode for pages\n" +" -M|--list-mapcnt Show page map count\n" +" -N|--no-summary Don't show summary info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" +" -F|--kpageflags filename kpageflags file to parse\n" +" -h|--help Show this usage message\n" +"flags:\n" +" 0x10 bitfield format, e.g.\n" +" anon bit-name, e.g.\n" +" 0x10,anon comma-separated list, e.g.\n" +"addr-spec:\n" +" N one page at offset N (unit: pages)\n" +" N+M pages range from N to N+M-1\n" +" N,M pages range from N to M-1\n" +" N, pages range from N to end\n" +" ,M pages range from 0 to M-1\n" +"bits-spec:\n" +" bit1,bit2 (flags & (bit1|bit2)) != 0\n" +" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" +" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" +" =bit1,bit2 flags == (bit1|bit2)\n" +"bit-names:\n" + ); + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + printf("%16s%s", page_flag_names[i] + 2, + page_flag_type(1ULL << i)); + if (++j > 3) { + j = 0; + putchar('\n'); + } + } + printf("\n " + "(r) raw mode bits (o) overloaded bits\n"); +} + +static unsigned long long parse_number(const char *str) +{ + unsigned long long n; + + n = strtoll(str, NULL, 0); + + if (n == 0 && str[0] != '0') + fatal("invalid name or number: %s\n", str); + + return n; +} + +static void parse_pid(const char *str) +{ + FILE *file; + char buf[5000]; + + opt_pid = parse_number(str); + + sprintf(buf, "/proc/%d/pagemap", opt_pid); + pagemap_fd = checked_open(buf, O_RDONLY); + + sprintf(buf, "/proc/%d/maps", opt_pid); + file = fopen(buf, "r"); + if (!file) { + perror(buf); + exit(EXIT_FAILURE); + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + unsigned long vm_start; + unsigned long vm_end; + unsigned long long pgoff; + int major, minor; + char r, w, x, s; + unsigned long ino; + int n; + + n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", + &vm_start, + &vm_end, + &r, &w, &x, &s, + &pgoff, + &major, &minor, + &ino); + if (n < 10) { + fprintf(stderr, "unexpected line: %s\n", buf); + continue; + } + pg_start[nr_vmas] = vm_start / page_size; + pg_end[nr_vmas] = vm_end / page_size; + if (++nr_vmas >= MAX_VMAS) { + fprintf(stderr, "too many VMAs\n"); + break; + } + } + fclose(file); +} + +static void show_file(const char *name, const struct stat *st) +{ + unsigned long long size = st->st_size; + char atime[64], mtime[64]; + long now = time(NULL); + + printf("%s\tInode: %u\tSize: %llu (%llu pages)\n", + name, (unsigned)st->st_ino, + size, (size + page_size - 1) / page_size); + + strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime)); + strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime)); + + printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n", + mtime, now - st->st_mtime, + atime, now - st->st_atime); +} + +static sigjmp_buf sigbus_jmp; + +static void * volatile sigbus_addr; + +static void sigbus_handler(int sig, siginfo_t *info, void *ucontex) +{ + (void)sig; + (void)ucontex; + sigbus_addr = info ? info->si_addr : NULL; + siglongjmp(sigbus_jmp, 1); +} + +static struct sigaction sigbus_action = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, +}; + +static void walk_file_range(const char *name, int fd, + unsigned long off, unsigned long end) +{ + uint8_t vec[PAGEMAP_BATCH]; + uint64_t buf[PAGEMAP_BATCH], flags; + uint64_t cgroup = 0; + uint64_t mapcnt = 0; + unsigned long nr_pages, pfn, i; + ssize_t len; + void *ptr; + int first = 1; + + for (; off < end; off += len) { + nr_pages = (end - off + page_size - 1) / page_size; + if (nr_pages > PAGEMAP_BATCH) + nr_pages = PAGEMAP_BATCH; + len = nr_pages * page_size; + + ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off); + if (ptr == MAP_FAILED) + fatal("mmap failed: %s", name); + + /* determine cached pages */ + if (mincore(ptr, len, vec)) + fatal("mincore failed: %s", name); + + /* turn off readahead */ + if (madvise(ptr, len, MADV_RANDOM)) + fatal("madvice failed: %s", name); + + if (sigsetjmp(sigbus_jmp, 1)) { + end = off + sigbus_addr ? sigbus_addr - ptr : 0; + fprintf(stderr, "got sigbus at offset %lld: %s\n", + (long long)end, name); + goto got_sigbus; + } + + /* populate ptes */ + for (i = 0; i < nr_pages ; i++) { + if (vec[i] & 1) + (void)*(volatile int *)(ptr + i * page_size); + } +got_sigbus: + + /* turn off harvesting reference bits */ + if (madvise(ptr, len, MADV_SEQUENTIAL)) + fatal("madvice failed: %s", name); + + if (pagemap_read(buf, (unsigned long)ptr / page_size, + nr_pages) != nr_pages) + fatal("cannot read pagemap"); + + munmap(ptr, len); + + for (i = 0; i < nr_pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (!pfn) + continue; + if (!kpageflags_read(&flags, pfn, 1)) + continue; + if (!kpagecgroup_read(&cgroup, pfn, 1)) + fatal("kpagecgroup_read failed"); + if (!kpagecount_read(&mapcnt, pfn, 1)) + fatal("kpagecount_read failed"); + if (first && opt_list) { + first = 0; + flush_page_range(); + } + add_page(off / page_size + i, pfn, + flags, cgroup, mapcnt, buf[i]); + } + } +} + +static void walk_file(const char *name, const struct stat *st) +{ + int i; + int fd; + + fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); + + if (!nr_addr_ranges) + add_addr_range(0, st->st_size / page_size); + + for (i = 0; i < nr_addr_ranges; i++) + walk_file_range(name, fd, opt_offset[i] * page_size, + (opt_offset[i] + opt_size[i]) * page_size); + + close(fd); +} + +int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) +{ + (void)f; + switch (type) { + case FTW_F: + if (S_ISREG(st->st_mode)) + walk_file(name, st); + break; + case FTW_DNR: + fprintf(stderr, "cannot read dir: %s\n", name); + break; + } + return 0; +} + +struct stat st; + +static void walk_page_cache(void) +{ + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); + pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); + sigaction(SIGBUS, &sigbus_action, NULL); + + if (stat(opt_file, &st)) + fatal("stat failed: %s\n", opt_file); + + if (S_ISREG(st.st_mode)) { + walk_file(opt_file, &st); + } else if (S_ISDIR(st.st_mode)) { + /* do not follow symlinks and mountpoints */ + if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0) + fatal("nftw failed: %s\n", opt_file); + } else + fatal("unhandled file type: %s\n", opt_file); + + close(kpageflags_fd); + close(pagemap_fd); + signal(SIGBUS, SIG_DFL); +} + +static void parse_file(const char *name) +{ + opt_file = name; +} + +static void parse_cgroup(const char *path) +{ + if (path[0] == '@') { + opt_cgroup = parse_number(path + 1); + return; + } + + struct stat st; + + if (stat(path, &st)) + fatal("stat failed: %s: %m\n", path); + + if (!S_ISDIR(st.st_mode)) + fatal("cgroup supposed to be a directory: %s\n", path); + + opt_cgroup = st.st_ino; +} + +static void parse_addr_range(const char *optarg) +{ + unsigned long offset; + unsigned long size; + char *p; + + p = strchr(optarg, ','); + if (!p) + p = strchr(optarg, '+'); + + if (p == optarg) { + offset = 0; + size = parse_number(p + 1); + } else if (p) { + offset = parse_number(optarg); + if (p[1] == '\0') + size = ULONG_MAX; + else { + size = parse_number(p + 1); + if (*p == ',') { + if (size < offset) + fatal("invalid range: %lu,%lu\n", + offset, size); + size -= offset; + } + } + } else { + offset = parse_number(optarg); + size = 1; + } + + add_addr_range(offset, size); +} + +static void add_bits_filter(uint64_t mask, uint64_t bits) +{ + if (nr_bit_filters >= MAX_BIT_FILTERS) + fatal("too much bit filters\n"); + + opt_mask[nr_bit_filters] = mask; + opt_bits[nr_bit_filters] = bits; + nr_bit_filters++; +} + +static uint64_t parse_flag_name(const char *str, int len) +{ + size_t i; + + if (!*str || !len) + return 0; + + if (len <= 8 && !strncmp(str, "compound", len)) + return BITS_COMPOUND; + + for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if (!strncmp(str, page_flag_names[i] + 2, len)) + return 1ULL << i; + } + + return parse_number(str); +} + +static uint64_t parse_flag_names(const char *str, int all) +{ + const char *p = str; + uint64_t flags = 0; + + while (1) { + if (*p == ',' || *p == '=' || *p == '\0') { + if ((*str != '~') || (*str == '~' && all && *++str)) + flags |= parse_flag_name(str, p - str); + if (*p != ',') + break; + str = p + 1; + } + p++; + } + + return flags; +} + +static void parse_bits_mask(const char *optarg) +{ + uint64_t mask; + uint64_t bits; + const char *p; + + p = strchr(optarg, '='); + if (p == optarg) { + mask = KPF_ALL_BITS; + bits = parse_flag_names(p + 1, 0); + } else if (p) { + mask = parse_flag_names(optarg, 0); + bits = parse_flag_names(p + 1, 0); + } else if (strchr(optarg, '~')) { + mask = parse_flag_names(optarg, 1); + bits = parse_flag_names(optarg, 0); + } else { + mask = parse_flag_names(optarg, 0); + bits = KPF_ALL_BITS; + } + + add_bits_filter(mask, bits); +} + +static void parse_kpageflags(const char *name) +{ + opt_kpageflags = name; +} + +static void describe_flags(const char *optarg) +{ + uint64_t flags = parse_flag_names(optarg, 0); + + printf("0x%016llx\t%s\t%s\n", + (unsigned long long)flags, + page_flag_name(flags), + page_flag_longname(flags)); +} + +static const struct option opts[] = { + { "raw" , 0, NULL, 'r' }, + { "pid" , 1, NULL, 'p' }, + { "file" , 1, NULL, 'f' }, + { "addr" , 1, NULL, 'a' }, + { "bits" , 1, NULL, 'b' }, + { "cgroup" , 1, NULL, 'c' }, + { "describe" , 1, NULL, 'd' }, + { "mark-idle" , 0, NULL, 'i' }, + { "list" , 0, NULL, 'l' }, + { "list-each" , 0, NULL, 'L' }, + { "list-cgroup", 0, NULL, 'C' }, + { "list-mapcnt", 0, NULL, 'M' }, + { "no-summary", 0, NULL, 'N' }, + { "hwpoison" , 0, NULL, 'X' }, + { "unpoison" , 0, NULL, 'x' }, + { "kpageflags", 0, NULL, 'F' }, + { "help" , 0, NULL, 'h' }, + { NULL , 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, + "rp:f:a:b:d:c:CilLMNXxF:h", + opts, NULL)) != -1) { + switch (c) { + case 'r': + opt_raw = 1; + break; + case 'p': + parse_pid(optarg); + break; + case 'f': + parse_file(optarg); + break; + case 'a': + parse_addr_range(optarg); + break; + case 'b': + parse_bits_mask(optarg); + break; + case 'c': + parse_cgroup(optarg); + break; + case 'C': + opt_list_cgroup = 1; + break; + case 'd': + describe_flags(optarg); + exit(0); + case 'i': + opt_mark_idle = 1; + break; + case 'l': + opt_list = 1; + break; + case 'L': + opt_list = 2; + break; + case 'M': + opt_list_mapcnt = 1; + break; + case 'N': + opt_no_summary = 1; + break; + case 'X': + opt_hwpoison = 1; + prepare_hwpoison_fd(); + break; + case 'x': + opt_unpoison = 1; + prepare_hwpoison_fd(); + break; + case 'F': + parse_kpageflags(optarg); + break; + case 'h': + usage(); + exit(0); + default: + usage(); + exit(1); + } + } + + if (!opt_kpageflags) + opt_kpageflags = PROC_KPAGEFLAGS; + + if (opt_cgroup || opt_list_cgroup) + kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); + + if (opt_list && opt_list_mapcnt) + kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); + + if (opt_mark_idle) + page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); + + if (opt_list && opt_pid) + printf("voffset\t"); + if (opt_list && opt_file) + printf("foffset\t"); + if (opt_list && opt_list_cgroup) + printf("cgroup\t"); + if (opt_list && opt_list_mapcnt) + printf("map-cnt\t"); + + if (opt_list == 1) + printf("offset\tlen\tflags\n"); + if (opt_list == 2) + printf("offset\tflags\n"); + + if (opt_file) + walk_page_cache(); + else + walk_addr_ranges(); + + if (opt_list == 1) + flush_page_range(); + + if (opt_no_summary) + return 0; + + if (opt_list) + printf("\n\n"); + + if (opt_file) { + show_file(opt_file, &st); + printf("\n"); + } + + show_summary(); + + if (opt_list_mapcnt) + close(kpagecount_fd); + + if (page_idle_fd >= 0) + close(page_idle_fd); + + return 0; +} diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c new file mode 100644 index 000000000000..7c2ac124cdc8 --- /dev/null +++ b/tools/mm/page_owner_sort.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * User-space helper to sort the output of /sys/kernel/debug/page_owner + * + * Example use: + * cat /sys/kernel/debug/page_owner > page_owner_full.txt + * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + * Or sort by total memory: + * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt + * + * See Documentation/mm/page_owner.rst +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define bool int +#define true 1 +#define false 0 +#define TASK_COMM_LEN 16 + +struct block_list { + char *txt; + char *comm; // task command name + char *stacktrace; + __u64 ts_nsec; + __u64 free_ts_nsec; + int len; + int num; + int page_num; + pid_t pid; + pid_t tgid; + int allocator; +}; +enum FILTER_BIT { + FILTER_UNRELEASE = 1<<1, + FILTER_PID = 1<<2, + FILTER_TGID = 1<<3, + FILTER_COMM = 1<<4 +}; +enum CULL_BIT { + CULL_UNRELEASE = 1<<1, + CULL_PID = 1<<2, + CULL_TGID = 1<<3, + CULL_COMM = 1<<4, + CULL_STACKTRACE = 1<<5, + CULL_ALLOCATOR = 1<<6 +}; +enum ALLOCATOR_BIT { + ALLOCATOR_CMA = 1<<1, + ALLOCATOR_SLAB = 1<<2, + ALLOCATOR_VMALLOC = 1<<3, + ALLOCATOR_OTHERS = 1<<4 +}; +enum ARG_TYPE { + ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, + ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, + ARG_ALLOCATOR +}; +enum SORT_ORDER { + SORT_ASC = 1, + SORT_DESC = -1, +}; +struct filter_condition { + pid_t *pids; + pid_t *tgids; + char **comms; + int pids_size; + int tgids_size; + int comms_size; +}; +struct sort_condition { + int (**cmps)(const void *, const void *); + int *signs; + int size; +}; +static struct filter_condition fc; +static struct sort_condition sc; +static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t tgid_pattern; +static regex_t comm_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; +static struct block_list *list; +static int list_size; +static int max_size; +static int cull; +static int filter; +static bool debug_on; + +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); + +int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin) +{ + char *curr = buf, *const buf_end = buf + buf_size; + + while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { + if (*curr == '\n') { /* empty line */ + return curr - buf; + } + if (!strncmp(curr, "PFN", 3)) { + strcpy(ext_buf, curr); + continue; + } + curr += strlen(curr); + } + + return -1; /* EOF or no space left in buf. */ +} + +static int compare_txt(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->txt, l2->txt); +} + +static int compare_stacktrace(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->stacktrace, l2->stacktrace); +} + +static int compare_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->num - l2->num; +} + +static int compare_page_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->page_num - l2->page_num; +} + +static int compare_pid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_tgid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->tgid - l2->tgid; +} + +static int compare_allocator(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->allocator - l2->allocator; +} + +static int compare_comm(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->comm, l2->comm); +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int compare_release(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + if (!l1->free_ts_nsec && !l2->free_ts_nsec) + return 0; + if (l1->free_ts_nsec && l2->free_ts_nsec) + return 0; + return l1->free_ts_nsec ? 1 : -1; +} + +static int compare_cull_condition(const void *p1, const void *p2) +{ + if (cull == 0) + return compare_txt(p1, p2); + if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2)) + return compare_stacktrace(p1, p2); + if ((cull & CULL_PID) && compare_pid(p1, p2)) + return compare_pid(p1, p2); + if ((cull & CULL_TGID) && compare_tgid(p1, p2)) + return compare_tgid(p1, p2); + if ((cull & CULL_COMM) && compare_comm(p1, p2)) + return compare_comm(p1, p2); + if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) + return compare_release(p1, p2); + if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) + return compare_allocator(p1, p2); + return 0; +} + +static int compare_sort_condition(const void *p1, const void *p2) +{ + int cmp = 0; + + for (int i = 0; i < sc.size; ++i) + if (cmp == 0) + cmp = sc.signs[i] * sc.cmps[i](p1, p2); + return cmp; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; + regmatch_t pmatch[2]; + + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); + if (err != 0 || pmatch[1].rm_so == -1) { + if (debug_on) + fprintf(stderr, "no matching pattern in %s\n", buf); + return -1; + } + val_len = pmatch[1].rm_eo - pmatch[1].rm_so; + + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} + +static bool check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); + return false; + } + return true; +} + +static char **explode(char sep, const char *str, int *size) +{ + int count = 0, len = strlen(str); + int lastindex = -1, j = 0; + + for (int i = 0; i < len; i++) + if (str[i] == sep) + count++; + char **ret = calloc(++count, sizeof(char *)); + + for (int i = 0; i < len; i++) { + if (str[i] == sep) { + ret[j] = calloc(i - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1); + lastindex = i; + } + } + if (lastindex <= len - 1) { + ret[j] = calloc(len - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex); + } + *size = j; + return ret; +} + +static void free_explode(char **arr, int size) +{ + for (int i = 0; i < size; i++) + free(arr[i]); + free(arr); +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&order_pattern, order_str, buf); + errno = 0; + order_val = strtol(order_str, &endptr, 10); + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); + return 0; + } + + return 1 << order_val; +} + +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; + +} + +static pid_t get_tgid(char *buf) +{ + pid_t tgid; + char tgid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&tgid_pattern, tgid_str, buf); + errno = 0; + tgid = strtol(tgid_str, &endptr, 10); + if (errno != 0 || endptr == tgid_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); + return -1; + } + + return tgid; + +} + +static __u64 get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; +} + +static char *get_comm(char *buf) +{ + char *comm_str = malloc(TASK_COMM_LEN); + + memset(comm_str, 0, TASK_COMM_LEN); + + search_pattern(&comm_pattern, comm_str, buf); + errno = 0; + if (errno != 0) { + if (debug_on) + fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); + return NULL; + } + + return comm_str; +} + +static int get_arg_type(const char *arg) +{ + if (!strcmp(arg, "pid") || !strcmp(arg, "p")) + return ARG_PID; + else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg")) + return ARG_TGID; + else if (!strcmp(arg, "name") || !strcmp(arg, "n")) + return ARG_COMM; + else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) + return ARG_STACKTRACE; + else if (!strcmp(arg, "free") || !strcmp(arg, "f")) + return ARG_FREE; + else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) + return ARG_TXT; + else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) + return ARG_FREE_TS; + else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) + return ARG_ALLOC_TS; + else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) + return ARG_ALLOCATOR; + else { + return ARG_UNKNOWN; + } +} + +static int get_allocator(const char *buf, const char *migrate_info) +{ + char *tmp, *first_line, *second_line; + int allocator = 0; + + if (strstr(migrate_info, "CMA")) + allocator |= ALLOCATOR_CMA; + if (strstr(migrate_info, "slab")) + allocator |= ALLOCATOR_SLAB; + tmp = strstr(buf, "__vmalloc_node_range"); + if (tmp) { + second_line = tmp; + while (*tmp != '\n') + tmp--; + tmp--; + while (*tmp != '\n') + tmp--; + first_line = ++tmp; + tmp = strstr(tmp, "alloc_pages"); + if (tmp && first_line <= tmp && tmp < second_line) + allocator |= ALLOCATOR_VMALLOC; + } + if (allocator == 0) + allocator = ALLOCATOR_OTHERS; + return allocator; +} + +static bool match_num_list(int num, int *list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (list[i] == num) + return true; + return false; +} + +static bool match_str_list(const char *str, char **list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (!strcmp(list[i], str)) + return true; + return false; +} + +static bool is_need(char *buf) +{ + __u64 ts_nsec, free_ts_nsec; + + ts_nsec = get_ts_nsec(buf); + free_ts_nsec = get_free_ts_nsec(buf); + + if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) + return false; + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) + return false; + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) + return false; + + char *comm = get_comm(buf); + + if ((filter & FILTER_COMM) && + !match_str_list(comm, fc.comms, fc.comms_size)) { + free(comm); + return false; + } + free(comm); + return true; +} + +static bool add_list(char *buf, int len, char *ext_buf) +{ + if (list_size != 0 && + len == list[list_size-1].len && + memcmp(buf, list[list_size-1].txt, len) == 0) { + list[list_size-1].num++; + list[list_size-1].page_num += get_page_num(buf); + return true; + } + if (list_size == max_size) { + fprintf(stderr, "max_size too small??\n"); + return false; + } + if (!is_need(buf)) + return true; + list[list_size].pid = get_pid(buf); + list[list_size].tgid = get_tgid(buf); + list[list_size].comm = get_comm(buf); + list[list_size].txt = malloc(len+1); + if (!list[list_size].txt) { + fprintf(stderr, "Out of memory\n"); + return false; + } + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; + list[list_size].len = len; + list[list_size].num = 1; + list[list_size].page_num = get_page_num(buf); + + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + if (*list[list_size].stacktrace == '\n') + list[list_size].stacktrace++; + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + list[list_size].allocator = get_allocator(buf, ext_buf); + list_size++; + if (list_size % 1000 == 0) { + printf("loaded %d\r", list_size); + fflush(stdout); + } + return true; +} + +static bool parse_cull_args(const char *arg_str) +{ + int size = 0; + char **args = explode(',', arg_str, &size); + + for (int i = 0; i < size; ++i) { + int arg_type = get_arg_type(args[i]); + + if (arg_type == ARG_PID) + cull |= CULL_PID; + else if (arg_type == ARG_TGID) + cull |= CULL_TGID; + else if (arg_type == ARG_COMM) + cull |= CULL_COMM; + else if (arg_type == ARG_STACKTRACE) + cull |= CULL_STACKTRACE; + else if (arg_type == ARG_FREE) + cull |= CULL_UNRELEASE; + else if (arg_type == ARG_ALLOCATOR) + cull |= CULL_ALLOCATOR; + else { + free_explode(args, size); + return false; + } + } + free_explode(args, size); + if (sc.size == 0) + set_single_cmp(compare_num, SORT_DESC); + return true; +} + +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign) +{ + if (sc.signs == NULL || sc.size < 1) + sc.signs = calloc(1, sizeof(int)); + sc.signs[0] = sign; + if (sc.cmps == NULL || sc.size < 1) + sc.cmps = calloc(1, sizeof(int *)); + sc.cmps[0] = cmp; + sc.size = 1; +} + +static bool parse_sort_args(const char *arg_str) +{ + int size = 0; + + if (sc.size != 0) { /* reset sort_condition */ + free(sc.signs); + free(sc.cmps); + size = 0; + } + + char **args = explode(',', arg_str, &size); + + sc.signs = calloc(size, sizeof(int)); + sc.cmps = calloc(size, sizeof(int *)); + for (int i = 0; i < size; ++i) { + int offset = 0; + + sc.signs[i] = SORT_ASC; + if (args[i][0] == '-' || args[i][0] == '+') { + if (args[i][0] == '-') + sc.signs[i] = SORT_DESC; + offset = 1; + } + + int arg_type = get_arg_type(args[i]+offset); + + if (arg_type == ARG_PID) + sc.cmps[i] = compare_pid; + else if (arg_type == ARG_TGID) + sc.cmps[i] = compare_tgid; + else if (arg_type == ARG_COMM) + sc.cmps[i] = compare_comm; + else if (arg_type == ARG_STACKTRACE) + sc.cmps[i] = compare_stacktrace; + else if (arg_type == ARG_ALLOC_TS) + sc.cmps[i] = compare_ts; + else if (arg_type == ARG_FREE_TS) + sc.cmps[i] = compare_free_ts; + else if (arg_type == ARG_TXT) + sc.cmps[i] = compare_txt; + else if (arg_type == ARG_ALLOCATOR) + sc.cmps[i] = compare_allocator; + else { + free_explode(args, size); + sc.size = 0; + return false; + } + } + sc.size = size; + free_explode(args, size); + return true; +} + +static int *parse_nums_list(char *arg_str, int *list_size) +{ + int size = 0; + char **args = explode(',', arg_str, &size); + int *list = calloc(size, sizeof(int)); + + errno = 0; + for (int i = 0; i < size; ++i) { + char *endptr = NULL; + + list[i] = strtol(args[i], &endptr, 10); + if (errno != 0 || endptr == args[i] || *endptr != '\0') { + free(list); + return NULL; + } + } + *list_size = size; + free_explode(args, size); + return list; +} + +static void print_allocator(FILE *out, int allocator) +{ + fprintf(out, "allocated by "); + if (allocator & ALLOCATOR_CMA) + fprintf(out, "CMA "); + if (allocator & ALLOCATOR_SLAB) + fprintf(out, "SLAB "); + if (allocator & ALLOCATOR_VMALLOC) + fprintf(out, "VMALLOC "); + if (allocator & ALLOCATOR_OTHERS) + fprintf(out, "OTHERS "); +} + +#define BUF_SIZE (128 * 1024) + +static void usage(void) +{ + printf("Usage: ./page_owner_sort [OPTIONS] \n" + "-m\t\tSort by total memory.\n" + "-s\t\tSort by the stack trace.\n" + "-t\t\tSort by times (default).\n" + "-p\t\tSort by pid.\n" + "-P\t\tSort by tgid.\n" + "-n\t\tSort by task command name.\n" + "-a\t\tSort by memory allocate time.\n" + "-r\t\tSort by memory release time.\n" + "-f\t\tFilter out the information of blocks whose memory has been released.\n" + "-d\t\tPrint debug information.\n" + "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" + "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" + "--name \n\t\tSelect by command name. This selects the information of blocks whose command name appears in .\n" + "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" + "--sort \tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n" + ); +} + +int main(int argc, char **argv) +{ + FILE *fin, *fout; + char *buf, *ext_buf; + int i, count; + struct stat st; + int opt; + struct option longopts[] = { + { "pid", required_argument, NULL, 1 }, + { "tgid", required_argument, NULL, 2 }, + { "name", required_argument, NULL, 3 }, + { "cull", required_argument, NULL, 4 }, + { "sort", required_argument, NULL, 5 }, + { 0, 0, 0, 0}, + }; + + while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1) + switch (opt) { + case 'a': + set_single_cmp(compare_ts, SORT_ASC); + break; + case 'd': + debug_on = true; + break; + case 'f': + filter = filter | FILTER_UNRELEASE; + break; + case 'm': + set_single_cmp(compare_page_num, SORT_DESC); + break; + case 'p': + set_single_cmp(compare_pid, SORT_ASC); + break; + case 'r': + set_single_cmp(compare_free_ts, SORT_ASC); + break; + case 's': + set_single_cmp(compare_stacktrace, SORT_ASC); + break; + case 't': + set_single_cmp(compare_num, SORT_DESC); + break; + case 'P': + set_single_cmp(compare_tgid, SORT_ASC); + break; + case 'n': + set_single_cmp(compare_comm, SORT_ASC); + break; + case 1: + filter = filter | FILTER_PID; + fc.pids = parse_nums_list(optarg, &fc.pids_size); + if (fc.pids == NULL) { + fprintf(stderr, "wrong/invalid pid in from the command line:%s\n", + optarg); + exit(1); + } + break; + case 2: + filter = filter | FILTER_TGID; + fc.tgids = parse_nums_list(optarg, &fc.tgids_size); + if (fc.tgids == NULL) { + fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n", + optarg); + exit(1); + } + break; + case 3: + filter = filter | FILTER_COMM; + fc.comms = explode(',', optarg, &fc.comms_size); + break; + case 4: + if (!parse_cull_args(optarg)) { + fprintf(stderr, "wrong argument after --cull option:%s\n", + optarg); + exit(1); + } + break; + case 5: + if (!parse_sort_args(optarg)) { + fprintf(stderr, "wrong argument after --sort option:%s\n", + optarg); + exit(1); + } + break; + default: + usage(); + exit(1); + } + + if (optind >= (argc - 1)) { + usage(); + exit(1); + } + + fin = fopen(argv[optind], "r"); + fout = fopen(argv[optind + 1], "w"); + if (!fin || !fout) { + usage(); + perror("open: "); + exit(1); + } + + if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) + goto out_order; + if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) + goto out_pid; + if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) + goto out_tgid; + if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) + goto out_comm; + if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) + goto out_ts; + if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) + goto out_free_ts; + + fstat(fileno(fin), &st); + max_size = st.st_size / 100; /* hack ... */ + + list = malloc(max_size * sizeof(*list)); + buf = malloc(BUF_SIZE); + ext_buf = malloc(BUF_SIZE); + if (!list || !buf || !ext_buf) { + fprintf(stderr, "Out of memory\n"); + goto out_free; + } + + for ( ; ; ) { + int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin); + + if (buf_len < 0) + break; + if (!add_list(buf, buf_len, ext_buf)) + goto out_free; + } + + printf("loaded %d\n", list_size); + + printf("sorting ....\n"); + + qsort(list, list_size, sizeof(list[0]), compare_cull_condition); + + printf("culling\n"); + + for (i = count = 0; i < list_size; i++) { + if (count == 0 || + compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) { + list[count++] = list[i]; + } else { + list[count-1].num += list[i].num; + list[count-1].page_num += list[i].page_num; + } + } + + qsort(list, count, sizeof(list[0]), compare_sort_condition); + + for (i = 0; i < count; i++) { + if (cull == 0) { + fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num); + print_allocator(fout, list[i].allocator); + fprintf(fout, ":\n%s\n", list[i].txt); + } + else { + fprintf(fout, "%d times, %d pages", + list[i].num, list[i].page_num); + if (cull & CULL_PID || filter & FILTER_PID) + fprintf(fout, ", PID %d", list[i].pid); + if (cull & CULL_TGID || filter & FILTER_TGID) + fprintf(fout, ", TGID %d", list[i].pid); + if (cull & CULL_COMM || filter & FILTER_COMM) + fprintf(fout, ", task_comm_name: %s", list[i].comm); + if (cull & CULL_ALLOCATOR) { + fprintf(fout, ", "); + print_allocator(fout, list[i].allocator); + } + if (cull & CULL_UNRELEASE) + fprintf(fout, " (%s)", + list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED"); + if (cull & CULL_STACKTRACE) + fprintf(fout, ":\n%s", list[i].stacktrace); + fprintf(fout, "\n"); + } + } + +out_free: + if (ext_buf) + free(ext_buf); + if (buf) + free(buf); + if (list) + free(list); +out_free_ts: + regfree(&free_ts_nsec_pattern); +out_ts: + regfree(&ts_nsec_pattern); +out_comm: + regfree(&comm_pattern); +out_tgid: + regfree(&tgid_pattern); +out_pid: + regfree(&pid_pattern); +out_order: + regfree(&order_pattern); + + return 0; +} diff --git a/tools/mm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh new file mode 100644 index 000000000000..873a892147e5 --- /dev/null +++ b/tools/mm/slabinfo-gnuplot.sh @@ -0,0 +1,268 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +# Sergey Senozhatsky, 2015 +# sergey.senozhatsky.work@gmail.com +# + + +# This program is intended to plot a `slabinfo -X' stats, collected, +# for example, using the following command: +# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done +# +# Use `slabinfo-gnuplot.sh stats' to pre-process collected records +# and generate graphs (totals, slabs sorted by size, slabs sorted +# by size). +# +# Graphs can be [individually] regenerate with different ranges and +# size (-r %d,%d and -s %d,%d options). +# +# To visually compare N `totals' graphs, do +# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals +# + +min_slab_name_size=11 +xmin=0 +xmax=0 +width=1500 +height=700 +mode=preprocess + +usage() +{ + echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]" + echo "FILEs must contain 'slabinfo -X' samples" + echo "-t - plot totals for FILE(s)" + echo "-l - plot slabs stats for FILE(s)" + echo "-s %d,%d - set image width and height" + echo "-r %d,%d - use data samples from a given range" +} + +check_file_exist() +{ + if [ ! -f "$1" ]; then + echo "File '$1' does not exist" + exit 1 + fi +} + +do_slabs_plotting() +{ + local file=$1 + local out_file + local range="every ::$xmin" + local xtic="" + local xtic_rotate="norotate" + local lines=2000000 + local wc_lines + + check_file_exist "$file" + + out_file=`basename "$file"` + if [ $xmax -ne 0 ]; then + range="$range::$xmax" + lines=$((xmax-xmin)) + fi + + wc_lines=`cat "$file" | wc -l` + if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then + wc_lines=$lines + fi + + if [ "$wc_lines" -lt "$lines" ]; then + lines=$wc_lines + fi + + if [ $((width / lines)) -gt $min_slab_name_size ]; then + xtic=":xtic(1)" + xtic_rotate=90 + fi + +gnuplot -p << EOF +#!/usr/bin/env gnuplot + +set terminal png enhanced size $width,$height large +set output '$out_file.png' +set autoscale xy +set xlabel 'samples' +set ylabel 'bytes' +set style histogram columnstacked title textcolor lt -1 +set style fill solid 0.15 +set xtics rotate $xtic_rotate +set key left above Left title reverse + +plot "$file" $range u 2$xtic title 'SIZE' with boxes,\ + '' $range u 3 title 'LOSS' with boxes +EOF + + if [ $? -eq 0 ]; then + echo "$out_file.png" + fi +} + +do_totals_plotting() +{ + local gnuplot_cmd="" + local range="every ::$xmin" + local file="" + + if [ $xmax -ne 0 ]; then + range="$range::$xmax" + fi + + for i in "${t_files[@]}"; do + check_file_exist "$i" + + file="$file"`basename "$i"` + gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ + '$i Memory usage' with lines," + gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ + '$i Loss' with lines," + done + +gnuplot -p << EOF +#!/usr/bin/env gnuplot + +set terminal png enhanced size $width,$height large +set autoscale xy +set output '$file.png' +set xlabel 'samples' +set ylabel 'bytes' +set key left above Left title reverse + +plot $gnuplot_cmd +EOF + + if [ $? -eq 0 ]; then + echo "$file.png" + fi +} + +do_preprocess() +{ + local out + local lines + local in=$1 + + check_file_exist "$in" + + # use only 'TOP' slab (biggest memory usage or loss) + let lines=3 + out=`basename "$in"`"-slabs-by-loss" + `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ + grep -E -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + let lines=3 + out=`basename "$in"`"-slabs-by-size" + `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ + grep -E -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + out=`basename "$in"`"-totals" + `cat "$in" | grep "Memory used" |\ + awk '{print $3" "$7}' > "$out"` + if [ $? -eq 0 ]; then + t_files[0]=$out + do_totals_plotting + fi +} + +parse_opts() +{ + local opt + + while getopts "tlr::s::h" opt; do + case $opt in + t) + mode=totals + ;; + l) + mode=slabs + ;; + s) + array=(${OPTARG//,/ }) + width=${array[0]} + height=${array[1]} + ;; + r) + array=(${OPTARG//,/ }) + xmin=${array[0]} + xmax=${array[1]} + ;; + h) + usage + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + :) + echo "-$OPTARG requires an argument." >&2 + exit 1 + ;; + esac + done + + return $OPTIND +} + +parse_args() +{ + local idx=0 + local p + + for p in "$@"; do + case $mode in + preprocess) + files[$idx]=$p + idx=$idx+1 + ;; + totals) + t_files[$idx]=$p + idx=$idx+1 + ;; + slabs) + files[$idx]=$p + idx=$idx+1 + ;; + esac + done +} + +parse_opts "$@" +argstart=$? +parse_args "${@:$argstart}" + +if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then + usage + exit 1 +fi + +case $mode in + preprocess) + for i in "${files[@]}"; do + do_preprocess "$i" + done + ;; + totals) + do_totals_plotting + ;; + slabs) + for i in "${files[@]}"; do + do_slabs_plotting "$i" + done + ;; + *) + echo "Unknown mode $mode" >&2 + usage + exit 1 + ;; +esac diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c new file mode 100644 index 000000000000..cfaeaea71042 --- /dev/null +++ b/tools/mm/slabinfo.c @@ -0,0 +1,1544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Slabinfo: Tool to get reports about slabs + * + * (C) 2007 sgi, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter + * + * Compile with: + * + * gcc -o slabinfo slabinfo.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SLABS 500 +#define MAX_ALIASES 500 +#define MAX_NODES 1024 + +struct slabinfo { + char *name; + int alias; + int refs; + int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + unsigned int hwcache_align, object_size, objs_per_slab; + unsigned int sanity_checks, slab_size, store_user, trace; + int order, poison, reclaim_account, red_zone; + unsigned long partial, objects, slabs, objects_partial, objects_total; + unsigned long alloc_fastpath, alloc_slowpath; + unsigned long free_fastpath, free_slowpath; + unsigned long free_frozen, free_add_partial, free_remove_partial; + unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill; + unsigned long cpuslab_flush, deactivate_full, deactivate_empty; + unsigned long deactivate_to_head, deactivate_to_tail; + unsigned long deactivate_remote_frees, order_fallback; + unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; + unsigned long alloc_node_mismatch, deactivate_bypass; + unsigned long cpu_partial_alloc, cpu_partial_free; + int numa[MAX_NODES]; + int numa_partial[MAX_NODES]; +} slabinfo[MAX_SLABS]; + +struct aliasinfo { + char *name; + char *ref; + struct slabinfo *slab; +} aliasinfo[MAX_ALIASES]; + +int slabs; +int actual_slabs; +int aliases; +int alias_targets; +int highest_node; + +char buffer[4096]; + +int show_empty; +int show_report; +int show_alias; +int show_slab; +int skip_zero = 1; +int show_numa; +int show_track; +int show_first_alias; +int validate; +int shrink; +int show_inverted; +int show_single_ref; +int show_totals; +int sort_size; +int sort_active; +int set_debug; +int show_ops; +int sort_partial; +int show_activity; +int output_lines = -1; +int sort_loss; +int extended_totals; +int show_bytes; +int unreclaim_only; + +/* Debug options */ +int sanity; +int redzone; +int poison; +int tracking; +int tracing; + +int page_size; + +regex_t pattern; + +static void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static void usage(void) +{ + printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" + "slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" + "-a|--aliases Show aliases\n" + "-A|--activity Most active slabs first\n" + "-B|--Bytes Show size in bytes\n" + "-D|--display-active Switch line format to activity\n" + "-e|--empty Show empty slabs\n" + "-f|--first-alias Show first alias\n" + "-h|--help Show usage information\n" + "-i|--inverted Inverted list\n" + "-l|--slabs Show slabs\n" + "-L|--Loss Sort by loss\n" + "-n|--numa Show NUMA information\n" + "-N|--lines=K Show the first K slabs\n" + "-o|--ops Show kmem_cache_ops\n" + "-P|--partial Sort by number of partial slabs\n" + "-r|--report Detailed report on single slabs\n" + "-s|--shrink Shrink slabs\n" + "-S|--Size Sort by size\n" + "-t|--tracking Show alloc/free information\n" + "-T|--Totals Show summary information\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" + "-v|--validate Validate slabs\n" + "-X|--Xtotals Show extended summary information\n" + "-z|--zero Include empty slabs\n" + "-1|--1ref Single reference\n" + + "\n" + "-d | --debug Switch off all debug options\n" + "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" + + "\n" + "-d[afzput] | --debug=[afzput]\n" + " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" + " z | Z Redzoning\n" + " p | P Poisoning\n" + " u | U Tracking\n" + " t | T Tracing\n" + + "\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n" + ); +} + +static unsigned long read_obj(const char *name) +{ + FILE *f = fopen(name, "r"); + + if (!f) { + buffer[0] = 0; + if (errno == EACCES) + fatal("%s, Try using superuser\n", strerror(errno)); + } else { + if (!fgets(buffer, sizeof(buffer), f)) + buffer[0] = 0; + fclose(f); + if (buffer[strlen(buffer)] == '\n') + buffer[strlen(buffer)] = 0; + } + return strlen(buffer); +} + + +/* + * Get the contents of an attribute + */ +static unsigned long get_obj(const char *name) +{ + if (!read_obj(name)) + return 0; + + return atol(buffer); +} + +static unsigned long get_obj_and_str(const char *name, char **x) +{ + unsigned long result = 0; + char *p; + + *x = NULL; + + if (!read_obj(name)) { + x = NULL; + return 0; + } + result = strtoul(buffer, &p, 10); + while (*p == ' ') + p++; + if (*p) + *x = strdup(p); + return result; +} + +static void set_obj(struct slabinfo *s, const char *name, int n) +{ + char x[100]; + FILE *f; + + snprintf(x, 100, "%s/%s", s->name, name); + f = fopen(x, "w"); + if (!f) + fatal("Cannot write to %s\n", x); + + fprintf(f, "%d\n", n); + fclose(f); +} + +static unsigned long read_slab_obj(struct slabinfo *s, const char *name) +{ + char x[100]; + FILE *f; + size_t l; + + snprintf(x, 100, "%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer), f); + buffer[l] = 0; + fclose(f); + } + return l; +} + +static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) +{ + char x[128]; + FILE *f; + size_t l; + + snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer), f); + buffer[l] = 0; + fclose(f); + } + return l; +} + +/* + * Put a size string together + */ +static int store_size(char *buffer, unsigned long value) +{ + unsigned long divisor = 1; + char trailer = 0; + int n; + + if (!show_bytes) { + if (value > 1000000000UL) { + divisor = 100000000UL; + trailer = 'G'; + } else if (value > 1000000UL) { + divisor = 100000UL; + trailer = 'M'; + } else if (value > 1000UL) { + divisor = 100; + trailer = 'K'; + } + } + + value /= divisor; + n = sprintf(buffer, "%ld",value); + if (trailer) { + buffer[n] = trailer; + n++; + buffer[n] = 0; + } + if (divisor != 1) { + memmove(buffer + n - 2, buffer + n - 3, 4); + buffer[n-2] = '.'; + n++; + } + return n; +} + +static void decode_numa_list(int *numa, char *t) +{ + int node; + int nr; + + memset(numa, 0, MAX_NODES * sizeof(int)); + + if (!t) + return; + + while (*t == 'N') { + t++; + node = strtoul(t, &t, 10); + if (*t == '=') { + t++; + nr = strtoul(t, &t, 10); + numa[node] = nr; + if (node > highest_node) + highest_node = node; + } + while (*t == ' ') + t++; + } +} + +static void slab_validate(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + set_obj(s, "validate", 1); +} + +static void slab_shrink(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + set_obj(s, "shrink", 1); +} + +int line = 0; + +static void first_line(void) +{ + if (show_activity) + printf("Name Objects Alloc Free" + " %%Fast Fallb O CmpX UL\n"); + else + printf("Name Objects Objsize %s " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", + sort_loss ? " Loss" : "Space"); +} + +/* + * Find the shortest alias of a slab + */ +static struct aliasinfo *find_one_alias(struct slabinfo *find) +{ + struct aliasinfo *a; + struct aliasinfo *best = NULL; + + for(a = aliasinfo;a < aliasinfo + aliases; a++) { + if (a->slab == find && + (!best || strlen(best->name) < strlen(a->name))) { + best = a; + if (strncmp(a->name,"kmall", 5) == 0) + return best; + } + } + return best; +} + +static unsigned long slab_size(struct slabinfo *s) +{ + return s->slabs * (page_size << s->order); +} + +static unsigned long slab_activity(struct slabinfo *s) +{ + return s->alloc_fastpath + s->free_fastpath + + s->alloc_slowpath + s->free_slowpath; +} + +static unsigned long slab_waste(struct slabinfo *s) +{ + return slab_size(s) - s->objects * s->object_size; +} + +static void slab_numa(struct slabinfo *s, int mode) +{ + int node; + + if (strcmp(s->name, "*") == 0) + return; + + if (!highest_node) { + printf("\n%s: No NUMA information available.\n", s->name); + return; + } + + if (skip_zero && !s->slabs) + return; + + if (!line) { + printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); + for(node = 0; node <= highest_node; node++) + printf(" %4d", node); + printf("\n----------------------"); + for(node = 0; node <= highest_node; node++) + printf("-----"); + printf("\n"); + } + printf("%-21s ", mode ? "All slabs" : s->name); + for(node = 0; node <= highest_node; node++) { + char b[20]; + + store_size(b, s->numa[node]); + printf(" %4s", b); + } + printf("\n"); + if (mode) { + printf("%-21s ", "Partial slabs"); + for(node = 0; node <= highest_node; node++) { + char b[20]; + + store_size(b, s->numa_partial[node]); + printf(" %4s", b); + } + printf("\n"); + } + line++; +} + +static void show_tracking(struct slabinfo *s) +{ + printf("\n%s: Kernel object allocation\n", s->name); + printf("-----------------------------------------------------------------------\n"); + if (read_debug_slab_obj(s, "alloc_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "alloc_calls")) + printf("%s", buffer); + else + printf("No Data\n"); + + printf("\n%s: Kernel object freeing\n", s->name); + printf("------------------------------------------------------------------------\n"); + if (read_debug_slab_obj(s, "free_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "free_calls")) + printf("%s", buffer); + else + printf("No Data\n"); + +} + +static void ops(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + if (read_slab_obj(s, "ops")) { + printf("\n%s: kmem_cache operations\n", s->name); + printf("--------------------------------------------\n"); + printf("%s", buffer); + } else + printf("\n%s has no kmem_cache operations\n", s->name); +} + +static const char *onoff(int x) +{ + if (x) + return "On "; + return "Off"; +} + +static void slab_stats(struct slabinfo *s) +{ + unsigned long total_alloc; + unsigned long total_free; + unsigned long total; + + if (!s->alloc_slab) + return; + + total_alloc = s->alloc_fastpath + s->alloc_slowpath; + total_free = s->free_fastpath + s->free_slowpath; + + if (!total_alloc) + return; + + printf("\n"); + printf("Slab Perf Counter Alloc Free %%Al %%Fr\n"); + printf("--------------------------------------------------\n"); + printf("Fastpath %8lu %8lu %3lu %3lu\n", + s->alloc_fastpath, s->free_fastpath, + s->alloc_fastpath * 100 / total_alloc, + total_free ? s->free_fastpath * 100 / total_free : 0); + printf("Slowpath %8lu %8lu %3lu %3lu\n", + total_alloc - s->alloc_fastpath, s->free_slowpath, + (total_alloc - s->alloc_fastpath) * 100 / total_alloc, + total_free ? s->free_slowpath * 100 / total_free : 0); + printf("Page Alloc %8lu %8lu %3lu %3lu\n", + s->alloc_slab, s->free_slab, + s->alloc_slab * 100 / total_alloc, + total_free ? s->free_slab * 100 / total_free : 0); + printf("Add partial %8lu %8lu %3lu %3lu\n", + s->deactivate_to_head + s->deactivate_to_tail, + s->free_add_partial, + (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc, + total_free ? s->free_add_partial * 100 / total_free : 0); + printf("Remove partial %8lu %8lu %3lu %3lu\n", + s->alloc_from_partial, s->free_remove_partial, + s->alloc_from_partial * 100 / total_alloc, + total_free ? s->free_remove_partial * 100 / total_free : 0); + + printf("Cpu partial list %8lu %8lu %3lu %3lu\n", + s->cpu_partial_alloc, s->cpu_partial_free, + s->cpu_partial_alloc * 100 / total_alloc, + total_free ? s->cpu_partial_free * 100 / total_free : 0); + + printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", + s->deactivate_remote_frees, s->free_frozen, + s->deactivate_remote_frees * 100 / total_alloc, + total_free ? s->free_frozen * 100 / total_free : 0); + + printf("Total %8lu %8lu\n\n", total_alloc, total_free); + + if (s->cpuslab_flush) + printf("Flushes %8lu\n", s->cpuslab_flush); + + total = s->deactivate_full + s->deactivate_empty + + s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass; + + if (total) { + printf("\nSlab Deactivation Occurrences %%\n"); + printf("-------------------------------------------------\n"); + printf("Slab full %7lu %3lu%%\n", + s->deactivate_full, (s->deactivate_full * 100) / total); + printf("Slab empty %7lu %3lu%%\n", + s->deactivate_empty, (s->deactivate_empty * 100) / total); + printf("Moved to head of partial list %7lu %3lu%%\n", + s->deactivate_to_head, (s->deactivate_to_head * 100) / total); + printf("Moved to tail of partial list %7lu %3lu%%\n", + s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); + printf("Deactivation bypass %7lu %3lu%%\n", + s->deactivate_bypass, (s->deactivate_bypass * 100) / total); + printf("Refilled from foreign frees %7lu %3lu%%\n", + s->alloc_refill, (s->alloc_refill * 100) / total); + printf("Node mismatch %7lu %3lu%%\n", + s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total); + } + + if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) { + printf("\nCmpxchg_double Looping\n------------------------\n"); + printf("Locked Cmpxchg Double redos %lu\nUnlocked Cmpxchg Double redos %lu\n", + s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); + } +} + +static void report(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", + s->name, s->aliases, s->order, s->objects); + if (s->hwcache_align) + printf("** Hardware cacheline aligned\n"); + if (s->cache_dma) + printf("** Memory is allocated in a special DMA zone\n"); + if (s->destroy_by_rcu) + printf("** Slabs are destroyed via RCU\n"); + if (s->reclaim_account) + printf("** Reclaim accounting active\n"); + + printf("\nSizes (bytes) Slabs Debug Memory\n"); + printf("------------------------------------------------------------------------\n"); + printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n", + s->object_size, s->slabs, onoff(s->sanity_checks), + s->slabs * (page_size << s->order)); + printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n", + s->slab_size, s->slabs - s->partial - s->cpu_slabs, + onoff(s->red_zone), s->objects * s->object_size); + printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n", + page_size << s->order, s->partial, onoff(s->poison), + s->slabs * (page_size << s->order) - s->objects * s->object_size); + printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n", + s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user), + (s->slab_size - s->object_size) * s->objects); + printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n", + s->align, s->objs_per_slab, onoff(s->trace), + ((page_size << s->order) - s->objs_per_slab * s->slab_size) * + s->slabs); + + ops(s); + show_tracking(s); + slab_numa(s, 1); + slab_stats(s); +} + +static void slabcache(struct slabinfo *s) +{ + char size_str[20]; + char dist_str[40]; + char flags[20]; + char *p = flags; + + if (strcmp(s->name, "*") == 0) + return; + + if (unreclaim_only && s->reclaim_account) + return; + + if (actual_slabs == 1) { + report(s); + return; + } + + if (skip_zero && !show_empty && !s->slabs) + return; + + if (show_empty && s->slabs) + return; + + if (sort_loss == 0) + store_size(size_str, slab_size(s)); + else + store_size(size_str, slab_waste(s)); + snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, + s->partial, s->cpu_slabs); + + if (!line++) + first_line(); + + if (s->aliases) + *p++ = '*'; + if (s->cache_dma) + *p++ = 'd'; + if (s->hwcache_align) + *p++ = 'A'; + if (s->poison) + *p++ = 'P'; + if (s->reclaim_account) + *p++ = 'a'; + if (s->red_zone) + *p++ = 'Z'; + if (s->sanity_checks) + *p++ = 'F'; + if (s->store_user) + *p++ = 'U'; + if (s->trace) + *p++ = 'T'; + + *p = 0; + if (show_activity) { + unsigned long total_alloc; + unsigned long total_free; + + total_alloc = s->alloc_fastpath + s->alloc_slowpath; + total_free = s->free_fastpath + s->free_slowpath; + + printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n", + s->name, s->objects, + total_alloc, total_free, + total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, + total_free ? (s->free_fastpath * 100 / total_free) : 0, + s->order_fallback, s->order, s->cmpxchg_double_fail, + s->cmpxchg_double_cpu_fail); + } else { + printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", + s->name, s->objects, s->object_size, size_str, dist_str, + s->objs_per_slab, s->order, + s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? (s->objects * s->object_size * 100) / + (s->slabs * (page_size << s->order)) : 100, + flags); + } +} + +/* + * Analyze debug options. Return false if something is amiss. + */ +static int debug_opt_scan(char *opt) +{ + if (!opt || !opt[0] || strcmp(opt, "-") == 0) + return 1; + + if (strcasecmp(opt, "a") == 0) { + sanity = 1; + poison = 1; + redzone = 1; + tracking = 1; + return 1; + } + + for ( ; *opt; opt++) + switch (*opt) { + case 'F' : case 'f': + if (sanity) + return 0; + sanity = 1; + break; + case 'P' : case 'p': + if (poison) + return 0; + poison = 1; + break; + + case 'Z' : case 'z': + if (redzone) + return 0; + redzone = 1; + break; + + case 'U' : case 'u': + if (tracking) + return 0; + tracking = 1; + break; + + case 'T' : case 't': + if (tracing) + return 0; + tracing = 1; + break; + default: + return 0; + } + return 1; +} + +static int slab_empty(struct slabinfo *s) +{ + if (s->objects > 0) + return 0; + + /* + * We may still have slabs even if there are no objects. Shrinking will + * remove them. + */ + if (s->slabs != 0) + set_obj(s, "shrink", 1); + + return 1; +} + +static void slab_debug(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + if (sanity && !s->sanity_checks) { + set_obj(s, "sanity_checks", 1); + } + if (!sanity && s->sanity_checks) { + if (slab_empty(s)) + set_obj(s, "sanity_checks", 0); + else + fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name); + } + if (redzone && !s->red_zone) { + if (slab_empty(s)) + set_obj(s, "red_zone", 1); + else + fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name); + } + if (!redzone && s->red_zone) { + if (slab_empty(s)) + set_obj(s, "red_zone", 0); + else + fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name); + } + if (poison && !s->poison) { + if (slab_empty(s)) + set_obj(s, "poison", 1); + else + fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name); + } + if (!poison && s->poison) { + if (slab_empty(s)) + set_obj(s, "poison", 0); + else + fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name); + } + if (tracking && !s->store_user) { + if (slab_empty(s)) + set_obj(s, "store_user", 1); + else + fprintf(stderr, "%s not empty cannot enable tracking\n", s->name); + } + if (!tracking && s->store_user) { + if (slab_empty(s)) + set_obj(s, "store_user", 0); + else + fprintf(stderr, "%s not empty cannot disable tracking\n", s->name); + } + if (tracing && !s->trace) { + if (slabs == 1) + set_obj(s, "trace", 1); + else + fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name); + } + if (!tracing && s->trace) + set_obj(s, "trace", 1); +} + +static void totals(void) +{ + struct slabinfo *s; + + int used_slabs = 0; + char b1[20], b2[20], b3[20], b4[20]; + unsigned long long max = 1ULL << 63; + + /* Object size */ + unsigned long long min_objsize = max, max_objsize = 0, avg_objsize; + + /* Number of partial slabs in a slabcache */ + unsigned long long min_partial = max, max_partial = 0, + avg_partial, total_partial = 0; + + /* Number of slabs in a slab cache */ + unsigned long long min_slabs = max, max_slabs = 0, + avg_slabs, total_slabs = 0; + + /* Size of the whole slab */ + unsigned long long min_size = max, max_size = 0, + avg_size, total_size = 0; + + /* Bytes used for object storage in a slab */ + unsigned long long min_used = max, max_used = 0, + avg_used, total_used = 0; + + /* Waste: Bytes used for alignment and padding */ + unsigned long long min_waste = max, max_waste = 0, + avg_waste, total_waste = 0; + /* Number of objects in a slab */ + unsigned long long min_objects = max, max_objects = 0, + avg_objects, total_objects = 0; + /* Waste per object */ + unsigned long long min_objwaste = max, + max_objwaste = 0, avg_objwaste, + total_objwaste = 0; + + /* Memory per object */ + unsigned long long min_memobj = max, + max_memobj = 0, avg_memobj, + total_objsize = 0; + + /* Percentage of partial slabs per slab */ + unsigned long min_ppart = 100, max_ppart = 0, + avg_ppart, total_ppart = 0; + + /* Number of objects in partial slabs */ + unsigned long min_partobj = max, max_partobj = 0, + avg_partobj, total_partobj = 0; + + /* Percentage of partial objects of all objects in a slab */ + unsigned long min_ppartobj = 100, max_ppartobj = 0, + avg_ppartobj, total_ppartobj = 0; + + + for (s = slabinfo; s < slabinfo + slabs; s++) { + unsigned long long size; + unsigned long used; + unsigned long long wasted; + unsigned long long objwaste; + unsigned long percentage_partial_slabs; + unsigned long percentage_partial_objs; + + if (!s->slabs || !s->objects) + continue; + + used_slabs++; + + size = slab_size(s); + used = s->objects * s->object_size; + wasted = size - used; + objwaste = s->slab_size - s->object_size; + + percentage_partial_slabs = s->partial * 100 / s->slabs; + if (percentage_partial_slabs > 100) + percentage_partial_slabs = 100; + + percentage_partial_objs = s->objects_partial * 100 + / s->objects; + + if (percentage_partial_objs > 100) + percentage_partial_objs = 100; + + if (s->object_size < min_objsize) + min_objsize = s->object_size; + if (s->partial < min_partial) + min_partial = s->partial; + if (s->slabs < min_slabs) + min_slabs = s->slabs; + if (size < min_size) + min_size = size; + if (wasted < min_waste) + min_waste = wasted; + if (objwaste < min_objwaste) + min_objwaste = objwaste; + if (s->objects < min_objects) + min_objects = s->objects; + if (used < min_used) + min_used = used; + if (s->objects_partial < min_partobj) + min_partobj = s->objects_partial; + if (percentage_partial_slabs < min_ppart) + min_ppart = percentage_partial_slabs; + if (percentage_partial_objs < min_ppartobj) + min_ppartobj = percentage_partial_objs; + if (s->slab_size < min_memobj) + min_memobj = s->slab_size; + + if (s->object_size > max_objsize) + max_objsize = s->object_size; + if (s->partial > max_partial) + max_partial = s->partial; + if (s->slabs > max_slabs) + max_slabs = s->slabs; + if (size > max_size) + max_size = size; + if (wasted > max_waste) + max_waste = wasted; + if (objwaste > max_objwaste) + max_objwaste = objwaste; + if (s->objects > max_objects) + max_objects = s->objects; + if (used > max_used) + max_used = used; + if (s->objects_partial > max_partobj) + max_partobj = s->objects_partial; + if (percentage_partial_slabs > max_ppart) + max_ppart = percentage_partial_slabs; + if (percentage_partial_objs > max_ppartobj) + max_ppartobj = percentage_partial_objs; + if (s->slab_size > max_memobj) + max_memobj = s->slab_size; + + total_partial += s->partial; + total_slabs += s->slabs; + total_size += size; + total_waste += wasted; + + total_objects += s->objects; + total_used += used; + total_partobj += s->objects_partial; + total_ppart += percentage_partial_slabs; + total_ppartobj += percentage_partial_objs; + + total_objwaste += s->objects * objwaste; + total_objsize += s->objects * s->slab_size; + } + + if (!total_objects) { + printf("No objects\n"); + return; + } + if (!used_slabs) { + printf("No slabs\n"); + return; + } + + /* Per slab averages */ + avg_partial = total_partial / used_slabs; + avg_slabs = total_slabs / used_slabs; + avg_size = total_size / used_slabs; + avg_waste = total_waste / used_slabs; + + avg_objects = total_objects / used_slabs; + avg_used = total_used / used_slabs; + avg_partobj = total_partobj / used_slabs; + avg_ppart = total_ppart / used_slabs; + avg_ppartobj = total_ppartobj / used_slabs; + + /* Per object object sizes */ + avg_objsize = total_used / total_objects; + avg_objwaste = total_objwaste / total_objects; + avg_partobj = total_partobj * 100 / total_objects; + avg_memobj = total_objsize / total_objects; + + printf("Slabcache Totals\n"); + printf("----------------\n"); + printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", + slabs, aliases, alias_targets, used_slabs); + + store_size(b1, total_size);store_size(b2, total_waste); + store_size(b3, total_waste * 100 / total_used); + printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); + + store_size(b1, total_objects);store_size(b2, total_partobj); + store_size(b3, total_partobj * 100 / total_objects); + printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); + + printf("\n"); + printf("Per Cache Average " + "Min Max Total\n"); + printf("---------------------------------------" + "-------------------------------------\n"); + + store_size(b1, avg_objects);store_size(b2, min_objects); + store_size(b3, max_objects);store_size(b4, total_objects); + printf("#Objects %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, total_slabs); + printf("#Slabs %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_partial);store_size(b2, min_partial); + store_size(b3, max_partial);store_size(b4, total_partial); + printf("#PartSlab %15s %15s %15s %15s\n", + b1, b2, b3, b4); + store_size(b1, avg_ppart);store_size(b2, min_ppart); + store_size(b3, max_ppart); + store_size(b4, total_partial * 100 / total_slabs); + printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_partobj);store_size(b2, min_partobj); + store_size(b3, max_partobj); + store_size(b4, total_partobj); + printf("PartObjs %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); + store_size(b3, max_ppartobj); + store_size(b4, total_partobj * 100 / total_objects); + printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_size);store_size(b2, min_size); + store_size(b3, max_size);store_size(b4, total_size); + printf("Memory %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_used);store_size(b2, min_used); + store_size(b3, max_used);store_size(b4, total_used); + printf("Used %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_waste);store_size(b2, min_waste); + store_size(b3, max_waste);store_size(b4, total_waste); + printf("Loss %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + printf("\n"); + printf("Per Object Average " + "Min Max\n"); + printf("---------------------------------------" + "--------------------\n"); + + store_size(b1, avg_memobj);store_size(b2, min_memobj); + store_size(b3, max_memobj); + printf("Memory %15s %15s %15s\n", + b1, b2, b3); + store_size(b1, avg_objsize);store_size(b2, min_objsize); + store_size(b3, max_objsize); + printf("User %15s %15s %15s\n", + b1, b2, b3); + + store_size(b1, avg_objwaste);store_size(b2, min_objwaste); + store_size(b3, max_objwaste); + printf("Loss %15s %15s %15s\n", + b1, b2, b3); +} + +static void sort_slabs(void) +{ + struct slabinfo *s1,*s2; + + for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { + for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { + int result; + + if (sort_size) { + if (slab_size(s1) == slab_size(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_size(s1) < slab_size(s2); + } else if (sort_active) { + if (slab_activity(s1) == slab_activity(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_activity(s1) < slab_activity(s2); + } else if (sort_loss) { + if (slab_waste(s1) == slab_waste(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_waste(s1) < slab_waste(s2); + } else if (sort_partial) { + if (s1->partial == s2->partial) + result = strcasecmp(s1->name, s2->name); + else + result = s1->partial < s2->partial; + } else + result = strcasecmp(s1->name, s2->name); + + if (show_inverted) + result = -result; + + if (result > 0) { + struct slabinfo t; + + memcpy(&t, s1, sizeof(struct slabinfo)); + memcpy(s1, s2, sizeof(struct slabinfo)); + memcpy(s2, &t, sizeof(struct slabinfo)); + } + } + } +} + +static void sort_aliases(void) +{ + struct aliasinfo *a1,*a2; + + for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { + for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { + char *n1, *n2; + + n1 = a1->name; + n2 = a2->name; + if (show_alias && !show_inverted) { + n1 = a1->ref; + n2 = a2->ref; + } + if (strcasecmp(n1, n2) > 0) { + struct aliasinfo t; + + memcpy(&t, a1, sizeof(struct aliasinfo)); + memcpy(a1, a2, sizeof(struct aliasinfo)); + memcpy(a2, &t, sizeof(struct aliasinfo)); + } + } + } +} + +static void link_slabs(void) +{ + struct aliasinfo *a; + struct slabinfo *s; + + for (a = aliasinfo; a < aliasinfo + aliases; a++) { + + for (s = slabinfo; s < slabinfo + slabs; s++) + if (strcmp(a->ref, s->name) == 0) { + a->slab = s; + s->refs++; + break; + } + if (s == slabinfo + slabs) + fatal("Unresolved alias %s\n", a->ref); + } +} + +static void alias(void) +{ + struct aliasinfo *a; + char *active = NULL; + + sort_aliases(); + link_slabs(); + + for(a = aliasinfo; a < aliasinfo + aliases; a++) { + + if (!show_single_ref && a->slab->refs == 1) + continue; + + if (!show_inverted) { + if (active) { + if (strcmp(a->slab->name, active) == 0) { + printf(" %s", a->name); + continue; + } + } + printf("\n%-12s <- %s", a->slab->name, a->name); + active = a->slab->name; + } + else + printf("%-15s -> %s\n", a->name, a->slab->name); + } + if (active) + printf("\n"); +} + + +static void rename_slabs(void) +{ + struct slabinfo *s; + struct aliasinfo *a; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + if (*s->name != ':') + continue; + + if (s->refs > 1 && !show_first_alias) + continue; + + a = find_one_alias(s); + + if (a) + s->name = a->name; + else { + s->name = "*"; + actual_slabs--; + } + } +} + +static int slab_mismatch(char *slab) +{ + return regexec(&pattern, slab, 0, NULL, 0); +} + +static void read_slab_dir(void) +{ + DIR *dir; + struct dirent *de; + struct slabinfo *slab = slabinfo; + struct aliasinfo *alias = aliasinfo; + char *p; + char *t; + int count; + + if (chdir("/sys/kernel/slab") && chdir("/sys/slab")) + fatal("SYSFS support for SLUB not active\n"); + + dir = opendir("."); + while ((de = readdir(dir))) { + if (de->d_name[0] == '.' || + (de->d_name[0] != ':' && slab_mismatch(de->d_name))) + continue; + switch (de->d_type) { + case DT_LNK: + alias->name = strdup(de->d_name); + count = readlink(de->d_name, buffer, sizeof(buffer)-1); + + if (count < 0) + fatal("Cannot read symlink %s\n", de->d_name); + + buffer[count] = 0; + p = buffer + count; + while (p > buffer && p[-1] != '/') + p--; + alias->ref = strdup(p); + alias++; + break; + case DT_DIR: + if (chdir(de->d_name)) + fatal("Unable to access slab %s\n", slab->name); + slab->name = strdup(de->d_name); + slab->alias = 0; + slab->refs = 0; + slab->aliases = get_obj("aliases"); + slab->align = get_obj("align"); + slab->cache_dma = get_obj("cache_dma"); + slab->cpu_slabs = get_obj("cpu_slabs"); + slab->destroy_by_rcu = get_obj("destroy_by_rcu"); + slab->hwcache_align = get_obj("hwcache_align"); + slab->object_size = get_obj("object_size"); + slab->objects = get_obj("objects"); + slab->objects_partial = get_obj("objects_partial"); + slab->objects_total = get_obj("objects_total"); + slab->objs_per_slab = get_obj("objs_per_slab"); + slab->order = get_obj("order"); + slab->partial = get_obj("partial"); + slab->partial = get_obj_and_str("partial", &t); + decode_numa_list(slab->numa_partial, t); + free(t); + slab->poison = get_obj("poison"); + slab->reclaim_account = get_obj("reclaim_account"); + slab->red_zone = get_obj("red_zone"); + slab->sanity_checks = get_obj("sanity_checks"); + slab->slab_size = get_obj("slab_size"); + slab->slabs = get_obj_and_str("slabs", &t); + decode_numa_list(slab->numa, t); + free(t); + slab->store_user = get_obj("store_user"); + slab->trace = get_obj("trace"); + slab->alloc_fastpath = get_obj("alloc_fastpath"); + slab->alloc_slowpath = get_obj("alloc_slowpath"); + slab->free_fastpath = get_obj("free_fastpath"); + slab->free_slowpath = get_obj("free_slowpath"); + slab->free_frozen= get_obj("free_frozen"); + slab->free_add_partial = get_obj("free_add_partial"); + slab->free_remove_partial = get_obj("free_remove_partial"); + slab->alloc_from_partial = get_obj("alloc_from_partial"); + slab->alloc_slab = get_obj("alloc_slab"); + slab->alloc_refill = get_obj("alloc_refill"); + slab->free_slab = get_obj("free_slab"); + slab->cpuslab_flush = get_obj("cpuslab_flush"); + slab->deactivate_full = get_obj("deactivate_full"); + slab->deactivate_empty = get_obj("deactivate_empty"); + slab->deactivate_to_head = get_obj("deactivate_to_head"); + slab->deactivate_to_tail = get_obj("deactivate_to_tail"); + slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); + slab->order_fallback = get_obj("order_fallback"); + slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); + slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); + slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); + slab->cpu_partial_free = get_obj("cpu_partial_free"); + slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); + slab->deactivate_bypass = get_obj("deactivate_bypass"); + chdir(".."); + if (slab->name[0] == ':') + alias_targets++; + slab++; + break; + default : + fatal("Unknown file type %lx\n", de->d_type); + } + } + closedir(dir); + slabs = slab - slabinfo; + actual_slabs = slabs; + aliases = alias - aliasinfo; + if (slabs > MAX_SLABS) + fatal("Too many slabs\n"); + if (aliases > MAX_ALIASES) + fatal("Too many aliases\n"); +} + +static void output_slabs(void) +{ + struct slabinfo *slab; + int lines = output_lines; + + for (slab = slabinfo; (slab < slabinfo + slabs) && + lines != 0; slab++) { + + if (slab->alias) + continue; + + if (lines != -1) + lines--; + + if (show_numa) + slab_numa(slab, 0); + else if (show_track) + show_tracking(slab); + else if (validate) + slab_validate(slab); + else if (shrink) + slab_shrink(slab); + else if (set_debug) + slab_debug(slab); + else if (show_ops) + ops(slab); + else if (show_slab) + slabcache(slab); + else if (show_report) + report(slab); + } +} + +static void _xtotals(char *heading, char *underline, + int loss, int size, int partial) +{ + printf("%s%s", heading, underline); + line = 0; + sort_loss = loss; + sort_size = size; + sort_partial = partial; + sort_slabs(); + output_slabs(); +} + +static void xtotals(void) +{ + char *heading, *underline; + + totals(); + + link_slabs(); + rename_slabs(); + + heading = "\nSlabs sorted by size\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 0, 1, 0); + + heading = "\nSlabs sorted by loss\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 1, 0, 0); + + heading = "\nSlabs sorted by number of partial slabs\n"; + underline = "---------------------------------------\n"; + _xtotals(heading, underline, 0, 0, 1); + + printf("\n"); +} + +struct option opts[] = { + { "aliases", no_argument, NULL, 'a' }, + { "activity", no_argument, NULL, 'A' }, + { "Bytes", no_argument, NULL, 'B'}, + { "debug", optional_argument, NULL, 'd' }, + { "display-activity", no_argument, NULL, 'D' }, + { "empty", no_argument, NULL, 'e' }, + { "first-alias", no_argument, NULL, 'f' }, + { "help", no_argument, NULL, 'h' }, + { "inverted", no_argument, NULL, 'i'}, + { "slabs", no_argument, NULL, 'l' }, + { "Loss", no_argument, NULL, 'L'}, + { "numa", no_argument, NULL, 'n' }, + { "lines", required_argument, NULL, 'N'}, + { "ops", no_argument, NULL, 'o' }, + { "partial", no_argument, NULL, 'p'}, + { "report", no_argument, NULL, 'r' }, + { "shrink", no_argument, NULL, 's' }, + { "Size", no_argument, NULL, 'S'}, + { "tracking", no_argument, NULL, 't'}, + { "Totals", no_argument, NULL, 'T'}, + { "Unreclaim", no_argument, NULL, 'U'}, + { "validate", no_argument, NULL, 'v' }, + { "Xtotals", no_argument, NULL, 'X'}, + { "zero", no_argument, NULL, 'z' }, + { "1ref", no_argument, NULL, '1'}, + { NULL, 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + int err; + char *pattern_source; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1", + opts, NULL)) != -1) + switch (c) { + case 'a': + show_alias = 1; + break; + case 'A': + sort_active = 1; + break; + case 'B': + show_bytes = 1; + break; + case 'd': + set_debug = 1; + if (!debug_opt_scan(optarg)) + fatal("Invalid debug option '%s'\n", optarg); + break; + case 'D': + show_activity = 1; + break; + case 'e': + show_empty = 1; + break; + case 'f': + show_first_alias = 1; + break; + case 'h': + usage(); + return 0; + case 'i': + show_inverted = 1; + break; + case 'l': + show_slab = 1; + break; + case 'L': + sort_loss = 1; + break; + case 'n': + show_numa = 1; + break; + case 'N': + if (optarg) { + output_lines = atoi(optarg); + if (output_lines < 1) + output_lines = 1; + } + break; + case 'o': + show_ops = 1; + break; + case 'r': + show_report = 1; + break; + case 'P': + sort_partial = 1; + break; + case 's': + shrink = 1; + break; + case 'S': + sort_size = 1; + break; + case 't': + show_track = 1; + break; + case 'T': + show_totals = 1; + break; + case 'U': + unreclaim_only = 1; + break; + case 'v': + validate = 1; + break; + case 'X': + if (output_lines == -1) + output_lines = 1; + extended_totals = 1; + show_bytes = 1; + break; + case 'z': + skip_zero = 0; + break; + case '1': + show_single_ref = 1; + break; + default: + fatal("%s: Invalid option '%c'\n", argv[0], optopt); + + } + + if (!show_slab && !show_alias && !show_track && !show_report + && !validate && !shrink && !set_debug && !show_ops) + show_slab = 1; + + if (argc > optind) + pattern_source = argv[optind]; + else + pattern_source = ".*"; + + err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB); + if (err) + fatal("%s: Invalid pattern '%s' code %d\n", + argv[0], pattern_source, err); + read_slab_dir(); + if (show_alias) { + alias(); + } else if (extended_totals) { + xtotals(); + } else if (show_totals) { + totals(); + } else { + link_slabs(); + rename_slabs(); + sort_slabs(); + output_slabs(); + } + return 0; +} diff --git a/tools/vm/.gitignore b/tools/vm/.gitignore deleted file mode 100644 index 922879f93fc8..000000000000 --- a/tools/vm/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -slabinfo -page-types -page_owner_sort diff --git a/tools/vm/Makefile b/tools/vm/Makefile deleted file mode 100644 index 9860622cbb15..000000000000 --- a/tools/vm/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for vm tools -# -include ../scripts/Makefile.include - -TARGETS=page-types slabinfo page_owner_sort - -LIB_DIR = ../lib/api -LIBS = $(LIB_DIR)/libapi.a - -CFLAGS = -Wall -Wextra -I../lib/ -LDFLAGS = $(LIBS) - -all: $(TARGETS) - -$(TARGETS): $(LIBS) - -$(LIBS): - make -C $(LIB_DIR) - -%: %.c - $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) - -clean: - $(RM) page-types slabinfo page_owner_sort - make -C $(LIB_DIR) clean - -sbindir ?= /usr/sbin - -install: all - install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c deleted file mode 100644 index 381dcc00cb62..000000000000 --- a/tools/vm/page-types.c +++ /dev/null @@ -1,1396 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * page-types: Tool for querying page flags - * - * Copyright (C) 2009 Intel corporation - * - * Authors: Wu Fengguang - */ - -#define _FILE_OFFSET_BITS 64 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../../include/uapi/linux/magic.h" -#include "../../include/uapi/linux/kernel-page-flags.h" -#include - -#ifndef MAX_PATH -# define MAX_PATH 256 -#endif - -#ifndef STR -# define _STR(x) #x -# define STR(x) _STR(x) -#endif - -/* - * pagemap kernel ABI bits - */ - -#define PM_ENTRY_BYTES 8 -#define PM_PFRAME_BITS 55 -#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -#define MAX_SWAPFILES_SHIFT 5 -#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT) -#define PM_SOFT_DIRTY (1ULL << 55) -#define PM_MMAP_EXCLUSIVE (1ULL << 56) -#define PM_FILE (1ULL << 61) -#define PM_SWAP (1ULL << 62) -#define PM_PRESENT (1ULL << 63) - -/* - * kernel page flags - */ - -#define KPF_BYTES 8 -#define PROC_KPAGEFLAGS "/proc/kpageflags" -#define PROC_KPAGECOUNT "/proc/kpagecount" -#define PROC_KPAGECGROUP "/proc/kpagecgroup" - -#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap" - -/* [32-] kernel hacking assistances */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 -#define KPF_SOFTDIRTY 40 -#define KPF_ARCH_2 41 - -/* [47-] take some arbitrary free slots for expanding overloaded flags - * not part of kernel API - */ -#define KPF_ANON_EXCLUSIVE 47 -#define KPF_READAHEAD 48 -#define KPF_SLOB_FREE 49 -#define KPF_SLUB_FROZEN 50 -#define KPF_SLUB_DEBUG 51 -#define KPF_FILE 61 -#define KPF_SWAP 62 -#define KPF_MMAP_EXCLUSIVE 63 - -#define KPF_ALL_BITS ((uint64_t)~0ULL) -#define KPF_HACKERS_BITS (0xffffULL << 32) -#define KPF_OVERLOADED_BITS (0xffffULL << 48) -#define BIT(name) (1ULL << KPF_##name) -#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) - -static const char * const page_flag_names[] = { - [KPF_LOCKED] = "L:locked", - [KPF_ERROR] = "E:error", - [KPF_REFERENCED] = "R:referenced", - [KPF_UPTODATE] = "U:uptodate", - [KPF_DIRTY] = "D:dirty", - [KPF_LRU] = "l:lru", - [KPF_ACTIVE] = "A:active", - [KPF_SLAB] = "S:slab", - [KPF_WRITEBACK] = "W:writeback", - [KPF_RECLAIM] = "I:reclaim", - [KPF_BUDDY] = "B:buddy", - - [KPF_MMAP] = "M:mmap", - [KPF_ANON] = "a:anonymous", - [KPF_SWAPCACHE] = "s:swapcache", - [KPF_SWAPBACKED] = "b:swapbacked", - [KPF_COMPOUND_HEAD] = "H:compound_head", - [KPF_COMPOUND_TAIL] = "T:compound_tail", - [KPF_HUGE] = "G:huge", - [KPF_UNEVICTABLE] = "u:unevictable", - [KPF_HWPOISON] = "X:hwpoison", - [KPF_NOPAGE] = "n:nopage", - [KPF_KSM] = "x:ksm", - [KPF_THP] = "t:thp", - [KPF_OFFLINE] = "o:offline", - [KPF_PGTABLE] = "g:pgtable", - [KPF_ZERO_PAGE] = "z:zero_page", - [KPF_IDLE] = "i:idle_page", - - [KPF_RESERVED] = "r:reserved", - [KPF_MLOCKED] = "m:mlocked", - [KPF_MAPPEDTODISK] = "d:mappedtodisk", - [KPF_PRIVATE] = "P:private", - [KPF_PRIVATE_2] = "p:private_2", - [KPF_OWNER_PRIVATE] = "O:owner_private", - [KPF_ARCH] = "h:arch", - [KPF_UNCACHED] = "c:uncached", - [KPF_SOFTDIRTY] = "f:softdirty", - [KPF_ARCH_2] = "H:arch_2", - - [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive", - [KPF_READAHEAD] = "I:readahead", - [KPF_SLOB_FREE] = "P:slob_free", - [KPF_SLUB_FROZEN] = "A:slub_frozen", - [KPF_SLUB_DEBUG] = "E:slub_debug", - - [KPF_FILE] = "F:file", - [KPF_SWAP] = "w:swap", - [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", -}; - - -/* - * data structures - */ - -static int opt_raw; /* for kernel developers */ -static int opt_list; /* list pages (in ranges) */ -static int opt_mark_idle; /* set accessed bit */ -static int opt_no_summary; /* don't show summary */ -static pid_t opt_pid; /* process to walk */ -const char *opt_file; /* file or directory path */ -static uint64_t opt_cgroup; /* cgroup inode */ -static int opt_list_cgroup;/* list page cgroup */ -static int opt_list_mapcnt;/* list page map count */ -static const char *opt_kpageflags;/* kpageflags file to parse */ - -#define MAX_ADDR_RANGES 1024 -static int nr_addr_ranges; -static unsigned long opt_offset[MAX_ADDR_RANGES]; -static unsigned long opt_size[MAX_ADDR_RANGES]; - -#define MAX_VMAS 10240 -static int nr_vmas; -static unsigned long pg_start[MAX_VMAS]; -static unsigned long pg_end[MAX_VMAS]; - -#define MAX_BIT_FILTERS 64 -static int nr_bit_filters; -static uint64_t opt_mask[MAX_BIT_FILTERS]; -static uint64_t opt_bits[MAX_BIT_FILTERS]; - -static int page_size; - -static int pagemap_fd; -static int kpageflags_fd; -static int kpagecount_fd = -1; -static int kpagecgroup_fd = -1; -static int page_idle_fd = -1; - -static int opt_hwpoison; -static int opt_unpoison; - -static const char *hwpoison_debug_fs; -static int hwpoison_inject_fd; -static int hwpoison_forget_fd; - -#define HASH_SHIFT 13 -#define HASH_SIZE (1 << HASH_SHIFT) -#define HASH_MASK (HASH_SIZE - 1) -#define HASH_KEY(flags) (flags & HASH_MASK) - -static unsigned long total_pages; -static unsigned long nr_pages[HASH_SIZE]; -static uint64_t page_flags[HASH_SIZE]; - - -/* - * helper functions - */ - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1 : __min2; }) - -#define max_t(type, x, y) ({ \ - type __max1 = (x); \ - type __max2 = (y); \ - __max1 > __max2 ? __max1 : __max2; }) - -static unsigned long pages2mb(unsigned long pages) -{ - return (pages * page_size) >> 20; -} - -static void fatal(const char *x, ...) -{ - va_list ap; - - va_start(ap, x); - vfprintf(stderr, x, ap); - va_end(ap); - exit(EXIT_FAILURE); -} - -static int checked_open(const char *pathname, int flags) -{ - int fd = open(pathname, flags); - - if (fd < 0) { - perror(pathname); - exit(EXIT_FAILURE); - } - - return fd; -} - -/* - * pagemap/kpageflags routines - */ - -static unsigned long do_u64_read(int fd, const char *name, - uint64_t *buf, - unsigned long index, - unsigned long count) -{ - long bytes; - - if (index > ULONG_MAX / 8) - fatal("index overflow: %lu\n", index); - - bytes = pread(fd, buf, count * 8, (off_t)index * 8); - if (bytes < 0) { - perror(name); - exit(EXIT_FAILURE); - } - if (bytes % 8) - fatal("partial read: %lu bytes\n", bytes); - - return bytes / 8; -} - -static unsigned long kpageflags_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); -} - -static unsigned long kpagecgroup_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - if (kpagecgroup_fd < 0) - return pages; - - return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); -} - -static unsigned long kpagecount_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return kpagecount_fd < 0 ? pages : - do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, - buf, index, pages); -} - -static unsigned long pagemap_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); -} - -static unsigned long pagemap_pfn(uint64_t val) -{ - unsigned long pfn; - - if (val & PM_PRESENT) - pfn = PM_PFRAME(val); - else - pfn = 0; - - return pfn; -} - -static unsigned long pagemap_swap_offset(uint64_t val) -{ - return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0; -} - -/* - * page flag names - */ - -static char *page_flag_name(uint64_t flags) -{ - static char buf[65]; - int present; - size_t i, j; - - for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { - present = (flags >> i) & 1; - if (!page_flag_names[i]) { - if (present) - fatal("unknown flag bit %d\n", i); - continue; - } - buf[j++] = present ? page_flag_names[i][0] : '_'; - } - - return buf; -} - -static char *page_flag_longname(uint64_t flags) -{ - static char buf[1024]; - size_t i, n; - - for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - if ((flags >> i) & 1) - n += snprintf(buf + n, sizeof(buf) - n, "%s,", - page_flag_names[i] + 2); - } - if (n) - n--; - buf[n] = '\0'; - - return buf; -} - - -/* - * page list and summary - */ - -static void show_page_range(unsigned long voffset, unsigned long offset, - unsigned long size, uint64_t flags, - uint64_t cgroup, uint64_t mapcnt) -{ - static uint64_t flags0; - static uint64_t cgroup0; - static uint64_t mapcnt0; - static unsigned long voff; - static unsigned long index; - static unsigned long count; - - if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 && - offset == index + count && size && voffset == voff + count) { - count += size; - return; - } - - if (count) { - if (opt_pid) - printf("%lx\t", voff); - if (opt_file) - printf("%lx\t", voff); - if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup0); - if (opt_list_mapcnt) - printf("%lu\t", mapcnt0); - printf("%lx\t%lx\t%s\n", - index, count, page_flag_name(flags0)); - } - - flags0 = flags; - cgroup0 = cgroup; - mapcnt0 = mapcnt; - index = offset; - voff = voffset; - count = size; -} - -static void flush_page_range(void) -{ - show_page_range(0, 0, 0, 0, 0, 0); -} - -static void show_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup, uint64_t mapcnt) -{ - if (opt_pid) - printf("%lx\t", voffset); - if (opt_file) - printf("%lx\t", voffset); - if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup); - if (opt_list_mapcnt) - printf("%lu\t", mapcnt); - - printf("%lx\t%s\n", offset, page_flag_name(flags)); -} - -static void show_summary(void) -{ - size_t i; - - printf(" flags\tpage-count MB" - " symbolic-flags\t\t\tlong-symbolic-flags\n"); - - for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { - if (nr_pages[i]) - printf("0x%016llx\t%10lu %8lu %s\t%s\n", - (unsigned long long)page_flags[i], - nr_pages[i], - pages2mb(nr_pages[i]), - page_flag_name(page_flags[i]), - page_flag_longname(page_flags[i])); - } - - printf(" total\t%10lu %8lu\n", - total_pages, pages2mb(total_pages)); -} - - -/* - * page flag filters - */ - -static int bit_mask_ok(uint64_t flags) -{ - int i; - - for (i = 0; i < nr_bit_filters; i++) { - if (opt_bits[i] == KPF_ALL_BITS) { - if ((flags & opt_mask[i]) == 0) - return 0; - } else { - if ((flags & opt_mask[i]) != opt_bits[i]) - return 0; - } - } - - return 1; -} - -static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) -{ - /* Anonymous pages overload PG_mappedtodisk */ - if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) - flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); - - /* SLOB/SLUB overload several page flags */ - if (flags & BIT(SLAB)) { - if (flags & BIT(PRIVATE)) - flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); - if (flags & BIT(ACTIVE)) - flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); - if (flags & BIT(ERROR)) - flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); - } - - /* PG_reclaim is overloaded as PG_readahead in the read path */ - if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) - flags ^= BIT(RECLAIM) | BIT(READAHEAD); - - if (pme & PM_SOFT_DIRTY) - flags |= BIT(SOFTDIRTY); - if (pme & PM_FILE) - flags |= BIT(FILE); - if (pme & PM_SWAP) - flags |= BIT(SWAP); - if (pme & PM_MMAP_EXCLUSIVE) - flags |= BIT(MMAP_EXCLUSIVE); - - return flags; -} - -static uint64_t well_known_flags(uint64_t flags) -{ - /* hide flags intended only for kernel hacker */ - flags &= ~KPF_HACKERS_BITS; - - /* hide non-hugeTLB compound pages */ - if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) - flags &= ~BITS_COMPOUND; - - return flags; -} - -static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme) -{ - if (opt_raw) - flags = expand_overloaded_flags(flags, pme); - else - flags = well_known_flags(flags); - - return flags; -} - -/* - * page actions - */ - -static void prepare_hwpoison_fd(void) -{ - char buf[MAX_PATH + 1]; - - hwpoison_debug_fs = debugfs__mount(); - if (!hwpoison_debug_fs) { - perror("mount debugfs"); - exit(EXIT_FAILURE); - } - - if (opt_hwpoison && !hwpoison_inject_fd) { - snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn", - hwpoison_debug_fs); - hwpoison_inject_fd = checked_open(buf, O_WRONLY); - } - - if (opt_unpoison && !hwpoison_forget_fd) { - snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn", - hwpoison_debug_fs); - hwpoison_forget_fd = checked_open(buf, O_WRONLY); - } -} - -static int hwpoison_page(unsigned long offset) -{ - char buf[100]; - int len; - - len = sprintf(buf, "0x%lx\n", offset); - len = write(hwpoison_inject_fd, buf, len); - if (len < 0) { - perror("hwpoison inject"); - return len; - } - return 0; -} - -static int unpoison_page(unsigned long offset) -{ - char buf[100]; - int len; - - len = sprintf(buf, "0x%lx\n", offset); - len = write(hwpoison_forget_fd, buf, len); - if (len < 0) { - perror("hwpoison forget"); - return len; - } - return 0; -} - -static int mark_page_idle(unsigned long offset) -{ - static unsigned long off; - static uint64_t buf; - int len; - - if ((offset / 64 == off / 64) || buf == 0) { - buf |= 1UL << (offset % 64); - off = offset; - return 0; - } - - len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64)); - if (len < 0) { - perror("mark page idle"); - return len; - } - - buf = 1UL << (offset % 64); - off = offset; - - return 0; -} - -/* - * page frame walker - */ - -static size_t hash_slot(uint64_t flags) -{ - size_t k = HASH_KEY(flags); - size_t i; - - /* Explicitly reserve slot 0 for flags 0: the following logic - * cannot distinguish an unoccupied slot from slot (flags==0). - */ - if (flags == 0) - return 0; - - /* search through the remaining (HASH_SIZE-1) slots */ - for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { - if (!k || k >= ARRAY_SIZE(page_flags)) - k = 1; - if (page_flags[k] == 0) { - page_flags[k] = flags; - return k; - } - if (page_flags[k] == flags) - return k; - } - - fatal("hash table full: bump up HASH_SHIFT?\n"); - exit(EXIT_FAILURE); -} - -static void add_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup, uint64_t mapcnt, - uint64_t pme) -{ - flags = kpageflags_flags(flags, pme); - - if (!bit_mask_ok(flags)) - return; - - if (opt_cgroup && cgroup != (uint64_t)opt_cgroup) - return; - - if (opt_hwpoison) - hwpoison_page(offset); - if (opt_unpoison) - unpoison_page(offset); - - if (opt_mark_idle) - mark_page_idle(offset); - - if (opt_list == 1) - show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); - else if (opt_list == 2) - show_page(voffset, offset, flags, cgroup, mapcnt); - - nr_pages[hash_slot(flags)]++; - total_pages++; -} - -#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ -static void walk_pfn(unsigned long voffset, - unsigned long index, - unsigned long count, - uint64_t pme) -{ - uint64_t buf[KPAGEFLAGS_BATCH]; - uint64_t cgi[KPAGEFLAGS_BATCH]; - uint64_t cnt[KPAGEFLAGS_BATCH]; - unsigned long batch; - unsigned long pages; - unsigned long i; - - /* - * kpagecgroup_read() reads only if kpagecgroup were opened, but - * /proc/kpagecgroup might even not exist, so it's better to fill - * them with zeros here. - */ - if (count == 1) - cgi[0] = 0; - else - memset(cgi, 0, sizeof cgi); - - while (count) { - batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); - pages = kpageflags_read(buf, index, batch); - if (pages == 0) - break; - - if (kpagecgroup_read(cgi, index, pages) != pages) - fatal("kpagecgroup returned fewer pages than expected"); - - if (kpagecount_read(cnt, index, pages) != pages) - fatal("kpagecount returned fewer pages than expected"); - - for (i = 0; i < pages; i++) - add_page(voffset + i, index + i, - buf[i], cgi[i], cnt[i], pme); - - index += pages; - count -= pages; - } -} - -static void walk_swap(unsigned long voffset, uint64_t pme) -{ - uint64_t flags = kpageflags_flags(0, pme); - - if (!bit_mask_ok(flags)) - return; - - if (opt_cgroup) - return; - - if (opt_list == 1) - show_page_range(voffset, pagemap_swap_offset(pme), - 1, flags, 0, 0); - else if (opt_list == 2) - show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0); - - nr_pages[hash_slot(flags)]++; - total_pages++; -} - -#define PAGEMAP_BATCH (64 << 10) -static void walk_vma(unsigned long index, unsigned long count) -{ - uint64_t buf[PAGEMAP_BATCH]; - unsigned long batch; - unsigned long pages; - unsigned long pfn; - unsigned long i; - - while (count) { - batch = min_t(unsigned long, count, PAGEMAP_BATCH); - pages = pagemap_read(buf, index, batch); - if (pages == 0) - break; - - for (i = 0; i < pages; i++) { - pfn = pagemap_pfn(buf[i]); - if (pfn) - walk_pfn(index + i, pfn, 1, buf[i]); - if (buf[i] & PM_SWAP) - walk_swap(index + i, buf[i]); - } - - index += pages; - count -= pages; - } -} - -static void walk_task(unsigned long index, unsigned long count) -{ - const unsigned long end = index + count; - unsigned long start; - int i = 0; - - while (index < end) { - - while (pg_end[i] <= index) - if (++i >= nr_vmas) - return; - if (pg_start[i] >= end) - return; - - start = max_t(unsigned long, pg_start[i], index); - index = min_t(unsigned long, pg_end[i], end); - - assert(start < index); - walk_vma(start, index - start); - } -} - -static void add_addr_range(unsigned long offset, unsigned long size) -{ - if (nr_addr_ranges >= MAX_ADDR_RANGES) - fatal("too many addr ranges\n"); - - opt_offset[nr_addr_ranges] = offset; - opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); - nr_addr_ranges++; -} - -static void walk_addr_ranges(void) -{ - int i; - - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); - - if (!nr_addr_ranges) - add_addr_range(0, ULONG_MAX); - - for (i = 0; i < nr_addr_ranges; i++) - if (!opt_pid) - walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0); - else - walk_task(opt_offset[i], opt_size[i]); - - if (opt_mark_idle) - mark_page_idle(0); - - close(kpageflags_fd); -} - - -/* - * user interface - */ - -static const char *page_flag_type(uint64_t flag) -{ - if (flag & KPF_HACKERS_BITS) - return "(r)"; - if (flag & KPF_OVERLOADED_BITS) - return "(o)"; - return " "; -} - -static void usage(void) -{ - size_t i, j; - - printf( -"page-types [options]\n" -" -r|--raw Raw mode, for kernel developers\n" -" -d|--describe flags Describe flags\n" -" -a|--addr addr-spec Walk a range of pages\n" -" -b|--bits bits-spec Walk pages with specified bits\n" -" -c|--cgroup path|@inode Walk pages within memory cgroup\n" -" -p|--pid pid Walk process address space\n" -" -f|--file filename Walk file address space\n" -" -i|--mark-idle Mark pages idle\n" -" -l|--list Show page details in ranges\n" -" -L|--list-each Show page details one by one\n" -" -C|--list-cgroup Show cgroup inode for pages\n" -" -M|--list-mapcnt Show page map count\n" -" -N|--no-summary Don't show summary info\n" -" -X|--hwpoison hwpoison pages\n" -" -x|--unpoison unpoison pages\n" -" -F|--kpageflags filename kpageflags file to parse\n" -" -h|--help Show this usage message\n" -"flags:\n" -" 0x10 bitfield format, e.g.\n" -" anon bit-name, e.g.\n" -" 0x10,anon comma-separated list, e.g.\n" -"addr-spec:\n" -" N one page at offset N (unit: pages)\n" -" N+M pages range from N to N+M-1\n" -" N,M pages range from N to M-1\n" -" N, pages range from N to end\n" -" ,M pages range from 0 to M-1\n" -"bits-spec:\n" -" bit1,bit2 (flags & (bit1|bit2)) != 0\n" -" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" -" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" -" =bit1,bit2 flags == (bit1|bit2)\n" -"bit-names:\n" - ); - - for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - printf("%16s%s", page_flag_names[i] + 2, - page_flag_type(1ULL << i)); - if (++j > 3) { - j = 0; - putchar('\n'); - } - } - printf("\n " - "(r) raw mode bits (o) overloaded bits\n"); -} - -static unsigned long long parse_number(const char *str) -{ - unsigned long long n; - - n = strtoll(str, NULL, 0); - - if (n == 0 && str[0] != '0') - fatal("invalid name or number: %s\n", str); - - return n; -} - -static void parse_pid(const char *str) -{ - FILE *file; - char buf[5000]; - - opt_pid = parse_number(str); - - sprintf(buf, "/proc/%d/pagemap", opt_pid); - pagemap_fd = checked_open(buf, O_RDONLY); - - sprintf(buf, "/proc/%d/maps", opt_pid); - file = fopen(buf, "r"); - if (!file) { - perror(buf); - exit(EXIT_FAILURE); - } - - while (fgets(buf, sizeof(buf), file) != NULL) { - unsigned long vm_start; - unsigned long vm_end; - unsigned long long pgoff; - int major, minor; - char r, w, x, s; - unsigned long ino; - int n; - - n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", - &vm_start, - &vm_end, - &r, &w, &x, &s, - &pgoff, - &major, &minor, - &ino); - if (n < 10) { - fprintf(stderr, "unexpected line: %s\n", buf); - continue; - } - pg_start[nr_vmas] = vm_start / page_size; - pg_end[nr_vmas] = vm_end / page_size; - if (++nr_vmas >= MAX_VMAS) { - fprintf(stderr, "too many VMAs\n"); - break; - } - } - fclose(file); -} - -static void show_file(const char *name, const struct stat *st) -{ - unsigned long long size = st->st_size; - char atime[64], mtime[64]; - long now = time(NULL); - - printf("%s\tInode: %u\tSize: %llu (%llu pages)\n", - name, (unsigned)st->st_ino, - size, (size + page_size - 1) / page_size); - - strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime)); - strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime)); - - printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n", - mtime, now - st->st_mtime, - atime, now - st->st_atime); -} - -static sigjmp_buf sigbus_jmp; - -static void * volatile sigbus_addr; - -static void sigbus_handler(int sig, siginfo_t *info, void *ucontex) -{ - (void)sig; - (void)ucontex; - sigbus_addr = info ? info->si_addr : NULL; - siglongjmp(sigbus_jmp, 1); -} - -static struct sigaction sigbus_action = { - .sa_sigaction = sigbus_handler, - .sa_flags = SA_SIGINFO, -}; - -static void walk_file_range(const char *name, int fd, - unsigned long off, unsigned long end) -{ - uint8_t vec[PAGEMAP_BATCH]; - uint64_t buf[PAGEMAP_BATCH], flags; - uint64_t cgroup = 0; - uint64_t mapcnt = 0; - unsigned long nr_pages, pfn, i; - ssize_t len; - void *ptr; - int first = 1; - - for (; off < end; off += len) { - nr_pages = (end - off + page_size - 1) / page_size; - if (nr_pages > PAGEMAP_BATCH) - nr_pages = PAGEMAP_BATCH; - len = nr_pages * page_size; - - ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off); - if (ptr == MAP_FAILED) - fatal("mmap failed: %s", name); - - /* determine cached pages */ - if (mincore(ptr, len, vec)) - fatal("mincore failed: %s", name); - - /* turn off readahead */ - if (madvise(ptr, len, MADV_RANDOM)) - fatal("madvice failed: %s", name); - - if (sigsetjmp(sigbus_jmp, 1)) { - end = off + sigbus_addr ? sigbus_addr - ptr : 0; - fprintf(stderr, "got sigbus at offset %lld: %s\n", - (long long)end, name); - goto got_sigbus; - } - - /* populate ptes */ - for (i = 0; i < nr_pages ; i++) { - if (vec[i] & 1) - (void)*(volatile int *)(ptr + i * page_size); - } -got_sigbus: - - /* turn off harvesting reference bits */ - if (madvise(ptr, len, MADV_SEQUENTIAL)) - fatal("madvice failed: %s", name); - - if (pagemap_read(buf, (unsigned long)ptr / page_size, - nr_pages) != nr_pages) - fatal("cannot read pagemap"); - - munmap(ptr, len); - - for (i = 0; i < nr_pages; i++) { - pfn = pagemap_pfn(buf[i]); - if (!pfn) - continue; - if (!kpageflags_read(&flags, pfn, 1)) - continue; - if (!kpagecgroup_read(&cgroup, pfn, 1)) - fatal("kpagecgroup_read failed"); - if (!kpagecount_read(&mapcnt, pfn, 1)) - fatal("kpagecount_read failed"); - if (first && opt_list) { - first = 0; - flush_page_range(); - } - add_page(off / page_size + i, pfn, - flags, cgroup, mapcnt, buf[i]); - } - } -} - -static void walk_file(const char *name, const struct stat *st) -{ - int i; - int fd; - - fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); - - if (!nr_addr_ranges) - add_addr_range(0, st->st_size / page_size); - - for (i = 0; i < nr_addr_ranges; i++) - walk_file_range(name, fd, opt_offset[i] * page_size, - (opt_offset[i] + opt_size[i]) * page_size); - - close(fd); -} - -int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) -{ - (void)f; - switch (type) { - case FTW_F: - if (S_ISREG(st->st_mode)) - walk_file(name, st); - break; - case FTW_DNR: - fprintf(stderr, "cannot read dir: %s\n", name); - break; - } - return 0; -} - -struct stat st; - -static void walk_page_cache(void) -{ - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); - pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); - sigaction(SIGBUS, &sigbus_action, NULL); - - if (stat(opt_file, &st)) - fatal("stat failed: %s\n", opt_file); - - if (S_ISREG(st.st_mode)) { - walk_file(opt_file, &st); - } else if (S_ISDIR(st.st_mode)) { - /* do not follow symlinks and mountpoints */ - if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0) - fatal("nftw failed: %s\n", opt_file); - } else - fatal("unhandled file type: %s\n", opt_file); - - close(kpageflags_fd); - close(pagemap_fd); - signal(SIGBUS, SIG_DFL); -} - -static void parse_file(const char *name) -{ - opt_file = name; -} - -static void parse_cgroup(const char *path) -{ - if (path[0] == '@') { - opt_cgroup = parse_number(path + 1); - return; - } - - struct stat st; - - if (stat(path, &st)) - fatal("stat failed: %s: %m\n", path); - - if (!S_ISDIR(st.st_mode)) - fatal("cgroup supposed to be a directory: %s\n", path); - - opt_cgroup = st.st_ino; -} - -static void parse_addr_range(const char *optarg) -{ - unsigned long offset; - unsigned long size; - char *p; - - p = strchr(optarg, ','); - if (!p) - p = strchr(optarg, '+'); - - if (p == optarg) { - offset = 0; - size = parse_number(p + 1); - } else if (p) { - offset = parse_number(optarg); - if (p[1] == '\0') - size = ULONG_MAX; - else { - size = parse_number(p + 1); - if (*p == ',') { - if (size < offset) - fatal("invalid range: %lu,%lu\n", - offset, size); - size -= offset; - } - } - } else { - offset = parse_number(optarg); - size = 1; - } - - add_addr_range(offset, size); -} - -static void add_bits_filter(uint64_t mask, uint64_t bits) -{ - if (nr_bit_filters >= MAX_BIT_FILTERS) - fatal("too much bit filters\n"); - - opt_mask[nr_bit_filters] = mask; - opt_bits[nr_bit_filters] = bits; - nr_bit_filters++; -} - -static uint64_t parse_flag_name(const char *str, int len) -{ - size_t i; - - if (!*str || !len) - return 0; - - if (len <= 8 && !strncmp(str, "compound", len)) - return BITS_COMPOUND; - - for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - if (!strncmp(str, page_flag_names[i] + 2, len)) - return 1ULL << i; - } - - return parse_number(str); -} - -static uint64_t parse_flag_names(const char *str, int all) -{ - const char *p = str; - uint64_t flags = 0; - - while (1) { - if (*p == ',' || *p == '=' || *p == '\0') { - if ((*str != '~') || (*str == '~' && all && *++str)) - flags |= parse_flag_name(str, p - str); - if (*p != ',') - break; - str = p + 1; - } - p++; - } - - return flags; -} - -static void parse_bits_mask(const char *optarg) -{ - uint64_t mask; - uint64_t bits; - const char *p; - - p = strchr(optarg, '='); - if (p == optarg) { - mask = KPF_ALL_BITS; - bits = parse_flag_names(p + 1, 0); - } else if (p) { - mask = parse_flag_names(optarg, 0); - bits = parse_flag_names(p + 1, 0); - } else if (strchr(optarg, '~')) { - mask = parse_flag_names(optarg, 1); - bits = parse_flag_names(optarg, 0); - } else { - mask = parse_flag_names(optarg, 0); - bits = KPF_ALL_BITS; - } - - add_bits_filter(mask, bits); -} - -static void parse_kpageflags(const char *name) -{ - opt_kpageflags = name; -} - -static void describe_flags(const char *optarg) -{ - uint64_t flags = parse_flag_names(optarg, 0); - - printf("0x%016llx\t%s\t%s\n", - (unsigned long long)flags, - page_flag_name(flags), - page_flag_longname(flags)); -} - -static const struct option opts[] = { - { "raw" , 0, NULL, 'r' }, - { "pid" , 1, NULL, 'p' }, - { "file" , 1, NULL, 'f' }, - { "addr" , 1, NULL, 'a' }, - { "bits" , 1, NULL, 'b' }, - { "cgroup" , 1, NULL, 'c' }, - { "describe" , 1, NULL, 'd' }, - { "mark-idle" , 0, NULL, 'i' }, - { "list" , 0, NULL, 'l' }, - { "list-each" , 0, NULL, 'L' }, - { "list-cgroup", 0, NULL, 'C' }, - { "list-mapcnt", 0, NULL, 'M' }, - { "no-summary", 0, NULL, 'N' }, - { "hwpoison" , 0, NULL, 'X' }, - { "unpoison" , 0, NULL, 'x' }, - { "kpageflags", 0, NULL, 'F' }, - { "help" , 0, NULL, 'h' }, - { NULL , 0, NULL, 0 } -}; - -int main(int argc, char *argv[]) -{ - int c; - - page_size = getpagesize(); - - while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:CilLMNXxF:h", - opts, NULL)) != -1) { - switch (c) { - case 'r': - opt_raw = 1; - break; - case 'p': - parse_pid(optarg); - break; - case 'f': - parse_file(optarg); - break; - case 'a': - parse_addr_range(optarg); - break; - case 'b': - parse_bits_mask(optarg); - break; - case 'c': - parse_cgroup(optarg); - break; - case 'C': - opt_list_cgroup = 1; - break; - case 'd': - describe_flags(optarg); - exit(0); - case 'i': - opt_mark_idle = 1; - break; - case 'l': - opt_list = 1; - break; - case 'L': - opt_list = 2; - break; - case 'M': - opt_list_mapcnt = 1; - break; - case 'N': - opt_no_summary = 1; - break; - case 'X': - opt_hwpoison = 1; - prepare_hwpoison_fd(); - break; - case 'x': - opt_unpoison = 1; - prepare_hwpoison_fd(); - break; - case 'F': - parse_kpageflags(optarg); - break; - case 'h': - usage(); - exit(0); - default: - usage(); - exit(1); - } - } - - if (!opt_kpageflags) - opt_kpageflags = PROC_KPAGEFLAGS; - - if (opt_cgroup || opt_list_cgroup) - kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); - - if (opt_list && opt_list_mapcnt) - kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); - - if (opt_mark_idle) - page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); - - if (opt_list && opt_pid) - printf("voffset\t"); - if (opt_list && opt_file) - printf("foffset\t"); - if (opt_list && opt_list_cgroup) - printf("cgroup\t"); - if (opt_list && opt_list_mapcnt) - printf("map-cnt\t"); - - if (opt_list == 1) - printf("offset\tlen\tflags\n"); - if (opt_list == 2) - printf("offset\tflags\n"); - - if (opt_file) - walk_page_cache(); - else - walk_addr_ranges(); - - if (opt_list == 1) - flush_page_range(); - - if (opt_no_summary) - return 0; - - if (opt_list) - printf("\n\n"); - - if (opt_file) { - show_file(opt_file, &st); - printf("\n"); - } - - show_summary(); - - if (opt_list_mapcnt) - close(kpagecount_fd); - - if (page_idle_fd >= 0) - close(page_idle_fd); - - return 0; -} diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c deleted file mode 100644 index 7c2ac124cdc8..000000000000 --- a/tools/vm/page_owner_sort.c +++ /dev/null @@ -1,897 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * User-space helper to sort the output of /sys/kernel/debug/page_owner - * - * Example use: - * cat /sys/kernel/debug/page_owner > page_owner_full.txt - * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - * Or sort by total memory: - * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt - * - * See Documentation/mm/page_owner.rst -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define bool int -#define true 1 -#define false 0 -#define TASK_COMM_LEN 16 - -struct block_list { - char *txt; - char *comm; // task command name - char *stacktrace; - __u64 ts_nsec; - __u64 free_ts_nsec; - int len; - int num; - int page_num; - pid_t pid; - pid_t tgid; - int allocator; -}; -enum FILTER_BIT { - FILTER_UNRELEASE = 1<<1, - FILTER_PID = 1<<2, - FILTER_TGID = 1<<3, - FILTER_COMM = 1<<4 -}; -enum CULL_BIT { - CULL_UNRELEASE = 1<<1, - CULL_PID = 1<<2, - CULL_TGID = 1<<3, - CULL_COMM = 1<<4, - CULL_STACKTRACE = 1<<5, - CULL_ALLOCATOR = 1<<6 -}; -enum ALLOCATOR_BIT { - ALLOCATOR_CMA = 1<<1, - ALLOCATOR_SLAB = 1<<2, - ALLOCATOR_VMALLOC = 1<<3, - ALLOCATOR_OTHERS = 1<<4 -}; -enum ARG_TYPE { - ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, - ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, - ARG_ALLOCATOR -}; -enum SORT_ORDER { - SORT_ASC = 1, - SORT_DESC = -1, -}; -struct filter_condition { - pid_t *pids; - pid_t *tgids; - char **comms; - int pids_size; - int tgids_size; - int comms_size; -}; -struct sort_condition { - int (**cmps)(const void *, const void *); - int *signs; - int size; -}; -static struct filter_condition fc; -static struct sort_condition sc; -static regex_t order_pattern; -static regex_t pid_pattern; -static regex_t tgid_pattern; -static regex_t comm_pattern; -static regex_t ts_nsec_pattern; -static regex_t free_ts_nsec_pattern; -static struct block_list *list; -static int list_size; -static int max_size; -static int cull; -static int filter; -static bool debug_on; - -static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); - -int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin) -{ - char *curr = buf, *const buf_end = buf + buf_size; - - while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { - if (*curr == '\n') { /* empty line */ - return curr - buf; - } - if (!strncmp(curr, "PFN", 3)) { - strcpy(ext_buf, curr); - continue; - } - curr += strlen(curr); - } - - return -1; /* EOF or no space left in buf. */ -} - -static int compare_txt(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->txt, l2->txt); -} - -static int compare_stacktrace(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->stacktrace, l2->stacktrace); -} - -static int compare_num(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->num - l2->num; -} - -static int compare_page_num(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->page_num - l2->page_num; -} - -static int compare_pid(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->pid - l2->pid; -} - -static int compare_tgid(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->tgid - l2->tgid; -} - -static int compare_allocator(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->allocator - l2->allocator; -} - -static int compare_comm(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->comm, l2->comm); -} - -static int compare_ts(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->ts_nsec < l2->ts_nsec ? -1 : 1; -} - -static int compare_free_ts(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; -} - -static int compare_release(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - if (!l1->free_ts_nsec && !l2->free_ts_nsec) - return 0; - if (l1->free_ts_nsec && l2->free_ts_nsec) - return 0; - return l1->free_ts_nsec ? 1 : -1; -} - -static int compare_cull_condition(const void *p1, const void *p2) -{ - if (cull == 0) - return compare_txt(p1, p2); - if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2)) - return compare_stacktrace(p1, p2); - if ((cull & CULL_PID) && compare_pid(p1, p2)) - return compare_pid(p1, p2); - if ((cull & CULL_TGID) && compare_tgid(p1, p2)) - return compare_tgid(p1, p2); - if ((cull & CULL_COMM) && compare_comm(p1, p2)) - return compare_comm(p1, p2); - if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) - return compare_release(p1, p2); - if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) - return compare_allocator(p1, p2); - return 0; -} - -static int compare_sort_condition(const void *p1, const void *p2) -{ - int cmp = 0; - - for (int i = 0; i < sc.size; ++i) - if (cmp == 0) - cmp = sc.signs[i] * sc.cmps[i](p1, p2); - return cmp; -} - -static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) -{ - int err, val_len; - regmatch_t pmatch[2]; - - err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); - if (err != 0 || pmatch[1].rm_so == -1) { - if (debug_on) - fprintf(stderr, "no matching pattern in %s\n", buf); - return -1; - } - val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - - memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); - - return 0; -} - -static bool check_regcomp(regex_t *pattern, const char *regex) -{ - int err; - - err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); - if (err != 0 || pattern->re_nsub != 1) { - fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); - return false; - } - return true; -} - -static char **explode(char sep, const char *str, int *size) -{ - int count = 0, len = strlen(str); - int lastindex = -1, j = 0; - - for (int i = 0; i < len; i++) - if (str[i] == sep) - count++; - char **ret = calloc(++count, sizeof(char *)); - - for (int i = 0; i < len; i++) { - if (str[i] == sep) { - ret[j] = calloc(i - lastindex, sizeof(char)); - memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1); - lastindex = i; - } - } - if (lastindex <= len - 1) { - ret[j] = calloc(len - lastindex, sizeof(char)); - memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex); - } - *size = j; - return ret; -} - -static void free_explode(char **arr, int size) -{ - for (int i = 0; i < size; i++) - free(arr[i]); - free(arr); -} - -# define FIELD_BUFF 25 - -static int get_page_num(char *buf) -{ - int order_val; - char order_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&order_pattern, order_str, buf); - errno = 0; - order_val = strtol(order_str, &endptr, 10); - if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); - return 0; - } - - return 1 << order_val; -} - -static pid_t get_pid(char *buf) -{ - pid_t pid; - char pid_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&pid_pattern, pid_str, buf); - errno = 0; - pid = strtol(pid_str, &endptr, 10); - if (errno != 0 || endptr == pid_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); - return -1; - } - - return pid; - -} - -static pid_t get_tgid(char *buf) -{ - pid_t tgid; - char tgid_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&tgid_pattern, tgid_str, buf); - errno = 0; - tgid = strtol(tgid_str, &endptr, 10); - if (errno != 0 || endptr == tgid_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); - return -1; - } - - return tgid; - -} - -static __u64 get_ts_nsec(char *buf) -{ - __u64 ts_nsec; - char ts_nsec_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); - errno = 0; - ts_nsec = strtoull(ts_nsec_str, &endptr, 10); - if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); - return -1; - } - - return ts_nsec; -} - -static __u64 get_free_ts_nsec(char *buf) -{ - __u64 free_ts_nsec; - char free_ts_nsec_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); - errno = 0; - free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); - if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); - return -1; - } - - return free_ts_nsec; -} - -static char *get_comm(char *buf) -{ - char *comm_str = malloc(TASK_COMM_LEN); - - memset(comm_str, 0, TASK_COMM_LEN); - - search_pattern(&comm_pattern, comm_str, buf); - errno = 0; - if (errno != 0) { - if (debug_on) - fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); - return NULL; - } - - return comm_str; -} - -static int get_arg_type(const char *arg) -{ - if (!strcmp(arg, "pid") || !strcmp(arg, "p")) - return ARG_PID; - else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg")) - return ARG_TGID; - else if (!strcmp(arg, "name") || !strcmp(arg, "n")) - return ARG_COMM; - else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) - return ARG_STACKTRACE; - else if (!strcmp(arg, "free") || !strcmp(arg, "f")) - return ARG_FREE; - else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) - return ARG_TXT; - else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) - return ARG_FREE_TS; - else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) - return ARG_ALLOC_TS; - else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) - return ARG_ALLOCATOR; - else { - return ARG_UNKNOWN; - } -} - -static int get_allocator(const char *buf, const char *migrate_info) -{ - char *tmp, *first_line, *second_line; - int allocator = 0; - - if (strstr(migrate_info, "CMA")) - allocator |= ALLOCATOR_CMA; - if (strstr(migrate_info, "slab")) - allocator |= ALLOCATOR_SLAB; - tmp = strstr(buf, "__vmalloc_node_range"); - if (tmp) { - second_line = tmp; - while (*tmp != '\n') - tmp--; - tmp--; - while (*tmp != '\n') - tmp--; - first_line = ++tmp; - tmp = strstr(tmp, "alloc_pages"); - if (tmp && first_line <= tmp && tmp < second_line) - allocator |= ALLOCATOR_VMALLOC; - } - if (allocator == 0) - allocator = ALLOCATOR_OTHERS; - return allocator; -} - -static bool match_num_list(int num, int *list, int list_size) -{ - for (int i = 0; i < list_size; ++i) - if (list[i] == num) - return true; - return false; -} - -static bool match_str_list(const char *str, char **list, int list_size) -{ - for (int i = 0; i < list_size; ++i) - if (!strcmp(list[i], str)) - return true; - return false; -} - -static bool is_need(char *buf) -{ - __u64 ts_nsec, free_ts_nsec; - - ts_nsec = get_ts_nsec(buf); - free_ts_nsec = get_free_ts_nsec(buf); - - if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) - return false; - if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) - return false; - if ((filter & FILTER_TGID) && - !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) - return false; - - char *comm = get_comm(buf); - - if ((filter & FILTER_COMM) && - !match_str_list(comm, fc.comms, fc.comms_size)) { - free(comm); - return false; - } - free(comm); - return true; -} - -static bool add_list(char *buf, int len, char *ext_buf) -{ - if (list_size != 0 && - len == list[list_size-1].len && - memcmp(buf, list[list_size-1].txt, len) == 0) { - list[list_size-1].num++; - list[list_size-1].page_num += get_page_num(buf); - return true; - } - if (list_size == max_size) { - fprintf(stderr, "max_size too small??\n"); - return false; - } - if (!is_need(buf)) - return true; - list[list_size].pid = get_pid(buf); - list[list_size].tgid = get_tgid(buf); - list[list_size].comm = get_comm(buf); - list[list_size].txt = malloc(len+1); - if (!list[list_size].txt) { - fprintf(stderr, "Out of memory\n"); - return false; - } - memcpy(list[list_size].txt, buf, len); - list[list_size].txt[len] = 0; - list[list_size].len = len; - list[list_size].num = 1; - list[list_size].page_num = get_page_num(buf); - - list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; - if (*list[list_size].stacktrace == '\n') - list[list_size].stacktrace++; - list[list_size].ts_nsec = get_ts_nsec(buf); - list[list_size].free_ts_nsec = get_free_ts_nsec(buf); - list[list_size].allocator = get_allocator(buf, ext_buf); - list_size++; - if (list_size % 1000 == 0) { - printf("loaded %d\r", list_size); - fflush(stdout); - } - return true; -} - -static bool parse_cull_args(const char *arg_str) -{ - int size = 0; - char **args = explode(',', arg_str, &size); - - for (int i = 0; i < size; ++i) { - int arg_type = get_arg_type(args[i]); - - if (arg_type == ARG_PID) - cull |= CULL_PID; - else if (arg_type == ARG_TGID) - cull |= CULL_TGID; - else if (arg_type == ARG_COMM) - cull |= CULL_COMM; - else if (arg_type == ARG_STACKTRACE) - cull |= CULL_STACKTRACE; - else if (arg_type == ARG_FREE) - cull |= CULL_UNRELEASE; - else if (arg_type == ARG_ALLOCATOR) - cull |= CULL_ALLOCATOR; - else { - free_explode(args, size); - return false; - } - } - free_explode(args, size); - if (sc.size == 0) - set_single_cmp(compare_num, SORT_DESC); - return true; -} - -static void set_single_cmp(int (*cmp)(const void *, const void *), int sign) -{ - if (sc.signs == NULL || sc.size < 1) - sc.signs = calloc(1, sizeof(int)); - sc.signs[0] = sign; - if (sc.cmps == NULL || sc.size < 1) - sc.cmps = calloc(1, sizeof(int *)); - sc.cmps[0] = cmp; - sc.size = 1; -} - -static bool parse_sort_args(const char *arg_str) -{ - int size = 0; - - if (sc.size != 0) { /* reset sort_condition */ - free(sc.signs); - free(sc.cmps); - size = 0; - } - - char **args = explode(',', arg_str, &size); - - sc.signs = calloc(size, sizeof(int)); - sc.cmps = calloc(size, sizeof(int *)); - for (int i = 0; i < size; ++i) { - int offset = 0; - - sc.signs[i] = SORT_ASC; - if (args[i][0] == '-' || args[i][0] == '+') { - if (args[i][0] == '-') - sc.signs[i] = SORT_DESC; - offset = 1; - } - - int arg_type = get_arg_type(args[i]+offset); - - if (arg_type == ARG_PID) - sc.cmps[i] = compare_pid; - else if (arg_type == ARG_TGID) - sc.cmps[i] = compare_tgid; - else if (arg_type == ARG_COMM) - sc.cmps[i] = compare_comm; - else if (arg_type == ARG_STACKTRACE) - sc.cmps[i] = compare_stacktrace; - else if (arg_type == ARG_ALLOC_TS) - sc.cmps[i] = compare_ts; - else if (arg_type == ARG_FREE_TS) - sc.cmps[i] = compare_free_ts; - else if (arg_type == ARG_TXT) - sc.cmps[i] = compare_txt; - else if (arg_type == ARG_ALLOCATOR) - sc.cmps[i] = compare_allocator; - else { - free_explode(args, size); - sc.size = 0; - return false; - } - } - sc.size = size; - free_explode(args, size); - return true; -} - -static int *parse_nums_list(char *arg_str, int *list_size) -{ - int size = 0; - char **args = explode(',', arg_str, &size); - int *list = calloc(size, sizeof(int)); - - errno = 0; - for (int i = 0; i < size; ++i) { - char *endptr = NULL; - - list[i] = strtol(args[i], &endptr, 10); - if (errno != 0 || endptr == args[i] || *endptr != '\0') { - free(list); - return NULL; - } - } - *list_size = size; - free_explode(args, size); - return list; -} - -static void print_allocator(FILE *out, int allocator) -{ - fprintf(out, "allocated by "); - if (allocator & ALLOCATOR_CMA) - fprintf(out, "CMA "); - if (allocator & ALLOCATOR_SLAB) - fprintf(out, "SLAB "); - if (allocator & ALLOCATOR_VMALLOC) - fprintf(out, "VMALLOC "); - if (allocator & ALLOCATOR_OTHERS) - fprintf(out, "OTHERS "); -} - -#define BUF_SIZE (128 * 1024) - -static void usage(void) -{ - printf("Usage: ./page_owner_sort [OPTIONS] \n" - "-m\t\tSort by total memory.\n" - "-s\t\tSort by the stack trace.\n" - "-t\t\tSort by times (default).\n" - "-p\t\tSort by pid.\n" - "-P\t\tSort by tgid.\n" - "-n\t\tSort by task command name.\n" - "-a\t\tSort by memory allocate time.\n" - "-r\t\tSort by memory release time.\n" - "-f\t\tFilter out the information of blocks whose memory has been released.\n" - "-d\t\tPrint debug information.\n" - "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" - "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" - "--name \n\t\tSelect by command name. This selects the information of blocks whose command name appears in .\n" - "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" - "--sort \tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n" - ); -} - -int main(int argc, char **argv) -{ - FILE *fin, *fout; - char *buf, *ext_buf; - int i, count; - struct stat st; - int opt; - struct option longopts[] = { - { "pid", required_argument, NULL, 1 }, - { "tgid", required_argument, NULL, 2 }, - { "name", required_argument, NULL, 3 }, - { "cull", required_argument, NULL, 4 }, - { "sort", required_argument, NULL, 5 }, - { 0, 0, 0, 0}, - }; - - while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1) - switch (opt) { - case 'a': - set_single_cmp(compare_ts, SORT_ASC); - break; - case 'd': - debug_on = true; - break; - case 'f': - filter = filter | FILTER_UNRELEASE; - break; - case 'm': - set_single_cmp(compare_page_num, SORT_DESC); - break; - case 'p': - set_single_cmp(compare_pid, SORT_ASC); - break; - case 'r': - set_single_cmp(compare_free_ts, SORT_ASC); - break; - case 's': - set_single_cmp(compare_stacktrace, SORT_ASC); - break; - case 't': - set_single_cmp(compare_num, SORT_DESC); - break; - case 'P': - set_single_cmp(compare_tgid, SORT_ASC); - break; - case 'n': - set_single_cmp(compare_comm, SORT_ASC); - break; - case 1: - filter = filter | FILTER_PID; - fc.pids = parse_nums_list(optarg, &fc.pids_size); - if (fc.pids == NULL) { - fprintf(stderr, "wrong/invalid pid in from the command line:%s\n", - optarg); - exit(1); - } - break; - case 2: - filter = filter | FILTER_TGID; - fc.tgids = parse_nums_list(optarg, &fc.tgids_size); - if (fc.tgids == NULL) { - fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n", - optarg); - exit(1); - } - break; - case 3: - filter = filter | FILTER_COMM; - fc.comms = explode(',', optarg, &fc.comms_size); - break; - case 4: - if (!parse_cull_args(optarg)) { - fprintf(stderr, "wrong argument after --cull option:%s\n", - optarg); - exit(1); - } - break; - case 5: - if (!parse_sort_args(optarg)) { - fprintf(stderr, "wrong argument after --sort option:%s\n", - optarg); - exit(1); - } - break; - default: - usage(); - exit(1); - } - - if (optind >= (argc - 1)) { - usage(); - exit(1); - } - - fin = fopen(argv[optind], "r"); - fout = fopen(argv[optind + 1], "w"); - if (!fin || !fout) { - usage(); - perror("open: "); - exit(1); - } - - if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) - goto out_order; - if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) - goto out_pid; - if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) - goto out_tgid; - if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) - goto out_comm; - if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) - goto out_ts; - if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) - goto out_free_ts; - - fstat(fileno(fin), &st); - max_size = st.st_size / 100; /* hack ... */ - - list = malloc(max_size * sizeof(*list)); - buf = malloc(BUF_SIZE); - ext_buf = malloc(BUF_SIZE); - if (!list || !buf || !ext_buf) { - fprintf(stderr, "Out of memory\n"); - goto out_free; - } - - for ( ; ; ) { - int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin); - - if (buf_len < 0) - break; - if (!add_list(buf, buf_len, ext_buf)) - goto out_free; - } - - printf("loaded %d\n", list_size); - - printf("sorting ....\n"); - - qsort(list, list_size, sizeof(list[0]), compare_cull_condition); - - printf("culling\n"); - - for (i = count = 0; i < list_size; i++) { - if (count == 0 || - compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) { - list[count++] = list[i]; - } else { - list[count-1].num += list[i].num; - list[count-1].page_num += list[i].page_num; - } - } - - qsort(list, count, sizeof(list[0]), compare_sort_condition); - - for (i = 0; i < count; i++) { - if (cull == 0) { - fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num); - print_allocator(fout, list[i].allocator); - fprintf(fout, ":\n%s\n", list[i].txt); - } - else { - fprintf(fout, "%d times, %d pages", - list[i].num, list[i].page_num); - if (cull & CULL_PID || filter & FILTER_PID) - fprintf(fout, ", PID %d", list[i].pid); - if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); - if (cull & CULL_COMM || filter & FILTER_COMM) - fprintf(fout, ", task_comm_name: %s", list[i].comm); - if (cull & CULL_ALLOCATOR) { - fprintf(fout, ", "); - print_allocator(fout, list[i].allocator); - } - if (cull & CULL_UNRELEASE) - fprintf(fout, " (%s)", - list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED"); - if (cull & CULL_STACKTRACE) - fprintf(fout, ":\n%s", list[i].stacktrace); - fprintf(fout, "\n"); - } - } - -out_free: - if (ext_buf) - free(ext_buf); - if (buf) - free(buf); - if (list) - free(list); -out_free_ts: - regfree(&free_ts_nsec_pattern); -out_ts: - regfree(&ts_nsec_pattern); -out_comm: - regfree(&comm_pattern); -out_tgid: - regfree(&tgid_pattern); -out_pid: - regfree(&pid_pattern); -out_order: - regfree(&order_pattern); - - return 0; -} diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh deleted file mode 100644 index 873a892147e5..000000000000 --- a/tools/vm/slabinfo-gnuplot.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only - -# Sergey Senozhatsky, 2015 -# sergey.senozhatsky.work@gmail.com -# - - -# This program is intended to plot a `slabinfo -X' stats, collected, -# for example, using the following command: -# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done -# -# Use `slabinfo-gnuplot.sh stats' to pre-process collected records -# and generate graphs (totals, slabs sorted by size, slabs sorted -# by size). -# -# Graphs can be [individually] regenerate with different ranges and -# size (-r %d,%d and -s %d,%d options). -# -# To visually compare N `totals' graphs, do -# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals -# - -min_slab_name_size=11 -xmin=0 -xmax=0 -width=1500 -height=700 -mode=preprocess - -usage() -{ - echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]" - echo "FILEs must contain 'slabinfo -X' samples" - echo "-t - plot totals for FILE(s)" - echo "-l - plot slabs stats for FILE(s)" - echo "-s %d,%d - set image width and height" - echo "-r %d,%d - use data samples from a given range" -} - -check_file_exist() -{ - if [ ! -f "$1" ]; then - echo "File '$1' does not exist" - exit 1 - fi -} - -do_slabs_plotting() -{ - local file=$1 - local out_file - local range="every ::$xmin" - local xtic="" - local xtic_rotate="norotate" - local lines=2000000 - local wc_lines - - check_file_exist "$file" - - out_file=`basename "$file"` - if [ $xmax -ne 0 ]; then - range="$range::$xmax" - lines=$((xmax-xmin)) - fi - - wc_lines=`cat "$file" | wc -l` - if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then - wc_lines=$lines - fi - - if [ "$wc_lines" -lt "$lines" ]; then - lines=$wc_lines - fi - - if [ $((width / lines)) -gt $min_slab_name_size ]; then - xtic=":xtic(1)" - xtic_rotate=90 - fi - -gnuplot -p << EOF -#!/usr/bin/env gnuplot - -set terminal png enhanced size $width,$height large -set output '$out_file.png' -set autoscale xy -set xlabel 'samples' -set ylabel 'bytes' -set style histogram columnstacked title textcolor lt -1 -set style fill solid 0.15 -set xtics rotate $xtic_rotate -set key left above Left title reverse - -plot "$file" $range u 2$xtic title 'SIZE' with boxes,\ - '' $range u 3 title 'LOSS' with boxes -EOF - - if [ $? -eq 0 ]; then - echo "$out_file.png" - fi -} - -do_totals_plotting() -{ - local gnuplot_cmd="" - local range="every ::$xmin" - local file="" - - if [ $xmax -ne 0 ]; then - range="$range::$xmax" - fi - - for i in "${t_files[@]}"; do - check_file_exist "$i" - - file="$file"`basename "$i"` - gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ - '$i Memory usage' with lines," - gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ - '$i Loss' with lines," - done - -gnuplot -p << EOF -#!/usr/bin/env gnuplot - -set terminal png enhanced size $width,$height large -set autoscale xy -set output '$file.png' -set xlabel 'samples' -set ylabel 'bytes' -set key left above Left title reverse - -plot $gnuplot_cmd -EOF - - if [ $? -eq 0 ]; then - echo "$file.png" - fi -} - -do_preprocess() -{ - local out - local lines - local in=$1 - - check_file_exist "$in" - - # use only 'TOP' slab (biggest memory usage or loss) - let lines=3 - out=`basename "$in"`"-slabs-by-loss" - `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ - grep -E -iv '\-\-|Name|Slabs'\ - | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` - if [ $? -eq 0 ]; then - do_slabs_plotting "$out" - fi - - let lines=3 - out=`basename "$in"`"-slabs-by-size" - `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ - grep -E -iv '\-\-|Name|Slabs'\ - | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` - if [ $? -eq 0 ]; then - do_slabs_plotting "$out" - fi - - out=`basename "$in"`"-totals" - `cat "$in" | grep "Memory used" |\ - awk '{print $3" "$7}' > "$out"` - if [ $? -eq 0 ]; then - t_files[0]=$out - do_totals_plotting - fi -} - -parse_opts() -{ - local opt - - while getopts "tlr::s::h" opt; do - case $opt in - t) - mode=totals - ;; - l) - mode=slabs - ;; - s) - array=(${OPTARG//,/ }) - width=${array[0]} - height=${array[1]} - ;; - r) - array=(${OPTARG//,/ }) - xmin=${array[0]} - xmax=${array[1]} - ;; - h) - usage - exit 0 - ;; - \?) - echo "Invalid option: -$OPTARG" >&2 - exit 1 - ;; - :) - echo "-$OPTARG requires an argument." >&2 - exit 1 - ;; - esac - done - - return $OPTIND -} - -parse_args() -{ - local idx=0 - local p - - for p in "$@"; do - case $mode in - preprocess) - files[$idx]=$p - idx=$idx+1 - ;; - totals) - t_files[$idx]=$p - idx=$idx+1 - ;; - slabs) - files[$idx]=$p - idx=$idx+1 - ;; - esac - done -} - -parse_opts "$@" -argstart=$? -parse_args "${@:$argstart}" - -if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then - usage - exit 1 -fi - -case $mode in - preprocess) - for i in "${files[@]}"; do - do_preprocess "$i" - done - ;; - totals) - do_totals_plotting - ;; - slabs) - for i in "${files[@]}"; do - do_slabs_plotting "$i" - done - ;; - *) - echo "Unknown mode $mode" >&2 - usage - exit 1 - ;; -esac diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c deleted file mode 100644 index cfaeaea71042..000000000000 --- a/tools/vm/slabinfo.c +++ /dev/null @@ -1,1544 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Slabinfo: Tool to get reports about slabs - * - * (C) 2007 sgi, Christoph Lameter - * (C) 2011 Linux Foundation, Christoph Lameter - * - * Compile with: - * - * gcc -o slabinfo slabinfo.c - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_SLABS 500 -#define MAX_ALIASES 500 -#define MAX_NODES 1024 - -struct slabinfo { - char *name; - int alias; - int refs; - int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; - unsigned int hwcache_align, object_size, objs_per_slab; - unsigned int sanity_checks, slab_size, store_user, trace; - int order, poison, reclaim_account, red_zone; - unsigned long partial, objects, slabs, objects_partial, objects_total; - unsigned long alloc_fastpath, alloc_slowpath; - unsigned long free_fastpath, free_slowpath; - unsigned long free_frozen, free_add_partial, free_remove_partial; - unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill; - unsigned long cpuslab_flush, deactivate_full, deactivate_empty; - unsigned long deactivate_to_head, deactivate_to_tail; - unsigned long deactivate_remote_frees, order_fallback; - unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; - unsigned long alloc_node_mismatch, deactivate_bypass; - unsigned long cpu_partial_alloc, cpu_partial_free; - int numa[MAX_NODES]; - int numa_partial[MAX_NODES]; -} slabinfo[MAX_SLABS]; - -struct aliasinfo { - char *name; - char *ref; - struct slabinfo *slab; -} aliasinfo[MAX_ALIASES]; - -int slabs; -int actual_slabs; -int aliases; -int alias_targets; -int highest_node; - -char buffer[4096]; - -int show_empty; -int show_report; -int show_alias; -int show_slab; -int skip_zero = 1; -int show_numa; -int show_track; -int show_first_alias; -int validate; -int shrink; -int show_inverted; -int show_single_ref; -int show_totals; -int sort_size; -int sort_active; -int set_debug; -int show_ops; -int sort_partial; -int show_activity; -int output_lines = -1; -int sort_loss; -int extended_totals; -int show_bytes; -int unreclaim_only; - -/* Debug options */ -int sanity; -int redzone; -int poison; -int tracking; -int tracing; - -int page_size; - -regex_t pattern; - -static void fatal(const char *x, ...) -{ - va_list ap; - - va_start(ap, x); - vfprintf(stderr, x, ap); - va_end(ap); - exit(EXIT_FAILURE); -} - -static void usage(void) -{ - printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" - "-a|--aliases Show aliases\n" - "-A|--activity Most active slabs first\n" - "-B|--Bytes Show size in bytes\n" - "-D|--display-active Switch line format to activity\n" - "-e|--empty Show empty slabs\n" - "-f|--first-alias Show first alias\n" - "-h|--help Show usage information\n" - "-i|--inverted Inverted list\n" - "-l|--slabs Show slabs\n" - "-L|--Loss Sort by loss\n" - "-n|--numa Show NUMA information\n" - "-N|--lines=K Show the first K slabs\n" - "-o|--ops Show kmem_cache_ops\n" - "-P|--partial Sort by number of partial slabs\n" - "-r|--report Detailed report on single slabs\n" - "-s|--shrink Shrink slabs\n" - "-S|--Size Sort by size\n" - "-t|--tracking Show alloc/free information\n" - "-T|--Totals Show summary information\n" - "-U|--Unreclaim Show unreclaimable slabs only\n" - "-v|--validate Validate slabs\n" - "-X|--Xtotals Show extended summary information\n" - "-z|--zero Include empty slabs\n" - "-1|--1ref Single reference\n" - - "\n" - "-d | --debug Switch off all debug options\n" - "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" - - "\n" - "-d[afzput] | --debug=[afzput]\n" - " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" - " z | Z Redzoning\n" - " p | P Poisoning\n" - " u | U Tracking\n" - " t | T Tracing\n" - - "\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n" - ); -} - -static unsigned long read_obj(const char *name) -{ - FILE *f = fopen(name, "r"); - - if (!f) { - buffer[0] = 0; - if (errno == EACCES) - fatal("%s, Try using superuser\n", strerror(errno)); - } else { - if (!fgets(buffer, sizeof(buffer), f)) - buffer[0] = 0; - fclose(f); - if (buffer[strlen(buffer)] == '\n') - buffer[strlen(buffer)] = 0; - } - return strlen(buffer); -} - - -/* - * Get the contents of an attribute - */ -static unsigned long get_obj(const char *name) -{ - if (!read_obj(name)) - return 0; - - return atol(buffer); -} - -static unsigned long get_obj_and_str(const char *name, char **x) -{ - unsigned long result = 0; - char *p; - - *x = NULL; - - if (!read_obj(name)) { - x = NULL; - return 0; - } - result = strtoul(buffer, &p, 10); - while (*p == ' ') - p++; - if (*p) - *x = strdup(p); - return result; -} - -static void set_obj(struct slabinfo *s, const char *name, int n) -{ - char x[100]; - FILE *f; - - snprintf(x, 100, "%s/%s", s->name, name); - f = fopen(x, "w"); - if (!f) - fatal("Cannot write to %s\n", x); - - fprintf(f, "%d\n", n); - fclose(f); -} - -static unsigned long read_slab_obj(struct slabinfo *s, const char *name) -{ - char x[100]; - FILE *f; - size_t l; - - snprintf(x, 100, "%s/%s", s->name, name); - f = fopen(x, "r"); - if (!f) { - buffer[0] = 0; - l = 0; - } else { - l = fread(buffer, 1, sizeof(buffer), f); - buffer[l] = 0; - fclose(f); - } - return l; -} - -static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) -{ - char x[128]; - FILE *f; - size_t l; - - snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); - f = fopen(x, "r"); - if (!f) { - buffer[0] = 0; - l = 0; - } else { - l = fread(buffer, 1, sizeof(buffer), f); - buffer[l] = 0; - fclose(f); - } - return l; -} - -/* - * Put a size string together - */ -static int store_size(char *buffer, unsigned long value) -{ - unsigned long divisor = 1; - char trailer = 0; - int n; - - if (!show_bytes) { - if (value > 1000000000UL) { - divisor = 100000000UL; - trailer = 'G'; - } else if (value > 1000000UL) { - divisor = 100000UL; - trailer = 'M'; - } else if (value > 1000UL) { - divisor = 100; - trailer = 'K'; - } - } - - value /= divisor; - n = sprintf(buffer, "%ld",value); - if (trailer) { - buffer[n] = trailer; - n++; - buffer[n] = 0; - } - if (divisor != 1) { - memmove(buffer + n - 2, buffer + n - 3, 4); - buffer[n-2] = '.'; - n++; - } - return n; -} - -static void decode_numa_list(int *numa, char *t) -{ - int node; - int nr; - - memset(numa, 0, MAX_NODES * sizeof(int)); - - if (!t) - return; - - while (*t == 'N') { - t++; - node = strtoul(t, &t, 10); - if (*t == '=') { - t++; - nr = strtoul(t, &t, 10); - numa[node] = nr; - if (node > highest_node) - highest_node = node; - } - while (*t == ' ') - t++; - } -} - -static void slab_validate(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - set_obj(s, "validate", 1); -} - -static void slab_shrink(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - set_obj(s, "shrink", 1); -} - -int line = 0; - -static void first_line(void) -{ - if (show_activity) - printf("Name Objects Alloc Free" - " %%Fast Fallb O CmpX UL\n"); - else - printf("Name Objects Objsize %s " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", - sort_loss ? " Loss" : "Space"); -} - -/* - * Find the shortest alias of a slab - */ -static struct aliasinfo *find_one_alias(struct slabinfo *find) -{ - struct aliasinfo *a; - struct aliasinfo *best = NULL; - - for(a = aliasinfo;a < aliasinfo + aliases; a++) { - if (a->slab == find && - (!best || strlen(best->name) < strlen(a->name))) { - best = a; - if (strncmp(a->name,"kmall", 5) == 0) - return best; - } - } - return best; -} - -static unsigned long slab_size(struct slabinfo *s) -{ - return s->slabs * (page_size << s->order); -} - -static unsigned long slab_activity(struct slabinfo *s) -{ - return s->alloc_fastpath + s->free_fastpath + - s->alloc_slowpath + s->free_slowpath; -} - -static unsigned long slab_waste(struct slabinfo *s) -{ - return slab_size(s) - s->objects * s->object_size; -} - -static void slab_numa(struct slabinfo *s, int mode) -{ - int node; - - if (strcmp(s->name, "*") == 0) - return; - - if (!highest_node) { - printf("\n%s: No NUMA information available.\n", s->name); - return; - } - - if (skip_zero && !s->slabs) - return; - - if (!line) { - printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); - for(node = 0; node <= highest_node; node++) - printf(" %4d", node); - printf("\n----------------------"); - for(node = 0; node <= highest_node; node++) - printf("-----"); - printf("\n"); - } - printf("%-21s ", mode ? "All slabs" : s->name); - for(node = 0; node <= highest_node; node++) { - char b[20]; - - store_size(b, s->numa[node]); - printf(" %4s", b); - } - printf("\n"); - if (mode) { - printf("%-21s ", "Partial slabs"); - for(node = 0; node <= highest_node; node++) { - char b[20]; - - store_size(b, s->numa_partial[node]); - printf(" %4s", b); - } - printf("\n"); - } - line++; -} - -static void show_tracking(struct slabinfo *s) -{ - printf("\n%s: Kernel object allocation\n", s->name); - printf("-----------------------------------------------------------------------\n"); - if (read_debug_slab_obj(s, "alloc_traces")) - printf("%s", buffer); - else if (read_slab_obj(s, "alloc_calls")) - printf("%s", buffer); - else - printf("No Data\n"); - - printf("\n%s: Kernel object freeing\n", s->name); - printf("------------------------------------------------------------------------\n"); - if (read_debug_slab_obj(s, "free_traces")) - printf("%s", buffer); - else if (read_slab_obj(s, "free_calls")) - printf("%s", buffer); - else - printf("No Data\n"); - -} - -static void ops(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - if (read_slab_obj(s, "ops")) { - printf("\n%s: kmem_cache operations\n", s->name); - printf("--------------------------------------------\n"); - printf("%s", buffer); - } else - printf("\n%s has no kmem_cache operations\n", s->name); -} - -static const char *onoff(int x) -{ - if (x) - return "On "; - return "Off"; -} - -static void slab_stats(struct slabinfo *s) -{ - unsigned long total_alloc; - unsigned long total_free; - unsigned long total; - - if (!s->alloc_slab) - return; - - total_alloc = s->alloc_fastpath + s->alloc_slowpath; - total_free = s->free_fastpath + s->free_slowpath; - - if (!total_alloc) - return; - - printf("\n"); - printf("Slab Perf Counter Alloc Free %%Al %%Fr\n"); - printf("--------------------------------------------------\n"); - printf("Fastpath %8lu %8lu %3lu %3lu\n", - s->alloc_fastpath, s->free_fastpath, - s->alloc_fastpath * 100 / total_alloc, - total_free ? s->free_fastpath * 100 / total_free : 0); - printf("Slowpath %8lu %8lu %3lu %3lu\n", - total_alloc - s->alloc_fastpath, s->free_slowpath, - (total_alloc - s->alloc_fastpath) * 100 / total_alloc, - total_free ? s->free_slowpath * 100 / total_free : 0); - printf("Page Alloc %8lu %8lu %3lu %3lu\n", - s->alloc_slab, s->free_slab, - s->alloc_slab * 100 / total_alloc, - total_free ? s->free_slab * 100 / total_free : 0); - printf("Add partial %8lu %8lu %3lu %3lu\n", - s->deactivate_to_head + s->deactivate_to_tail, - s->free_add_partial, - (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc, - total_free ? s->free_add_partial * 100 / total_free : 0); - printf("Remove partial %8lu %8lu %3lu %3lu\n", - s->alloc_from_partial, s->free_remove_partial, - s->alloc_from_partial * 100 / total_alloc, - total_free ? s->free_remove_partial * 100 / total_free : 0); - - printf("Cpu partial list %8lu %8lu %3lu %3lu\n", - s->cpu_partial_alloc, s->cpu_partial_free, - s->cpu_partial_alloc * 100 / total_alloc, - total_free ? s->cpu_partial_free * 100 / total_free : 0); - - printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", - s->deactivate_remote_frees, s->free_frozen, - s->deactivate_remote_frees * 100 / total_alloc, - total_free ? s->free_frozen * 100 / total_free : 0); - - printf("Total %8lu %8lu\n\n", total_alloc, total_free); - - if (s->cpuslab_flush) - printf("Flushes %8lu\n", s->cpuslab_flush); - - total = s->deactivate_full + s->deactivate_empty + - s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass; - - if (total) { - printf("\nSlab Deactivation Occurrences %%\n"); - printf("-------------------------------------------------\n"); - printf("Slab full %7lu %3lu%%\n", - s->deactivate_full, (s->deactivate_full * 100) / total); - printf("Slab empty %7lu %3lu%%\n", - s->deactivate_empty, (s->deactivate_empty * 100) / total); - printf("Moved to head of partial list %7lu %3lu%%\n", - s->deactivate_to_head, (s->deactivate_to_head * 100) / total); - printf("Moved to tail of partial list %7lu %3lu%%\n", - s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); - printf("Deactivation bypass %7lu %3lu%%\n", - s->deactivate_bypass, (s->deactivate_bypass * 100) / total); - printf("Refilled from foreign frees %7lu %3lu%%\n", - s->alloc_refill, (s->alloc_refill * 100) / total); - printf("Node mismatch %7lu %3lu%%\n", - s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total); - } - - if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) { - printf("\nCmpxchg_double Looping\n------------------------\n"); - printf("Locked Cmpxchg Double redos %lu\nUnlocked Cmpxchg Double redos %lu\n", - s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); - } -} - -static void report(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", - s->name, s->aliases, s->order, s->objects); - if (s->hwcache_align) - printf("** Hardware cacheline aligned\n"); - if (s->cache_dma) - printf("** Memory is allocated in a special DMA zone\n"); - if (s->destroy_by_rcu) - printf("** Slabs are destroyed via RCU\n"); - if (s->reclaim_account) - printf("** Reclaim accounting active\n"); - - printf("\nSizes (bytes) Slabs Debug Memory\n"); - printf("------------------------------------------------------------------------\n"); - printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n", - s->object_size, s->slabs, onoff(s->sanity_checks), - s->slabs * (page_size << s->order)); - printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n", - s->slab_size, s->slabs - s->partial - s->cpu_slabs, - onoff(s->red_zone), s->objects * s->object_size); - printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n", - page_size << s->order, s->partial, onoff(s->poison), - s->slabs * (page_size << s->order) - s->objects * s->object_size); - printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n", - s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user), - (s->slab_size - s->object_size) * s->objects); - printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n", - s->align, s->objs_per_slab, onoff(s->trace), - ((page_size << s->order) - s->objs_per_slab * s->slab_size) * - s->slabs); - - ops(s); - show_tracking(s); - slab_numa(s, 1); - slab_stats(s); -} - -static void slabcache(struct slabinfo *s) -{ - char size_str[20]; - char dist_str[40]; - char flags[20]; - char *p = flags; - - if (strcmp(s->name, "*") == 0) - return; - - if (unreclaim_only && s->reclaim_account) - return; - - if (actual_slabs == 1) { - report(s); - return; - } - - if (skip_zero && !show_empty && !s->slabs) - return; - - if (show_empty && s->slabs) - return; - - if (sort_loss == 0) - store_size(size_str, slab_size(s)); - else - store_size(size_str, slab_waste(s)); - snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, - s->partial, s->cpu_slabs); - - if (!line++) - first_line(); - - if (s->aliases) - *p++ = '*'; - if (s->cache_dma) - *p++ = 'd'; - if (s->hwcache_align) - *p++ = 'A'; - if (s->poison) - *p++ = 'P'; - if (s->reclaim_account) - *p++ = 'a'; - if (s->red_zone) - *p++ = 'Z'; - if (s->sanity_checks) - *p++ = 'F'; - if (s->store_user) - *p++ = 'U'; - if (s->trace) - *p++ = 'T'; - - *p = 0; - if (show_activity) { - unsigned long total_alloc; - unsigned long total_free; - - total_alloc = s->alloc_fastpath + s->alloc_slowpath; - total_free = s->free_fastpath + s->free_slowpath; - - printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n", - s->name, s->objects, - total_alloc, total_free, - total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, - total_free ? (s->free_fastpath * 100 / total_free) : 0, - s->order_fallback, s->order, s->cmpxchg_double_fail, - s->cmpxchg_double_cpu_fail); - } else { - printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", - s->name, s->objects, s->object_size, size_str, dist_str, - s->objs_per_slab, s->order, - s->slabs ? (s->partial * 100) / s->slabs : 100, - s->slabs ? (s->objects * s->object_size * 100) / - (s->slabs * (page_size << s->order)) : 100, - flags); - } -} - -/* - * Analyze debug options. Return false if something is amiss. - */ -static int debug_opt_scan(char *opt) -{ - if (!opt || !opt[0] || strcmp(opt, "-") == 0) - return 1; - - if (strcasecmp(opt, "a") == 0) { - sanity = 1; - poison = 1; - redzone = 1; - tracking = 1; - return 1; - } - - for ( ; *opt; opt++) - switch (*opt) { - case 'F' : case 'f': - if (sanity) - return 0; - sanity = 1; - break; - case 'P' : case 'p': - if (poison) - return 0; - poison = 1; - break; - - case 'Z' : case 'z': - if (redzone) - return 0; - redzone = 1; - break; - - case 'U' : case 'u': - if (tracking) - return 0; - tracking = 1; - break; - - case 'T' : case 't': - if (tracing) - return 0; - tracing = 1; - break; - default: - return 0; - } - return 1; -} - -static int slab_empty(struct slabinfo *s) -{ - if (s->objects > 0) - return 0; - - /* - * We may still have slabs even if there are no objects. Shrinking will - * remove them. - */ - if (s->slabs != 0) - set_obj(s, "shrink", 1); - - return 1; -} - -static void slab_debug(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - if (sanity && !s->sanity_checks) { - set_obj(s, "sanity_checks", 1); - } - if (!sanity && s->sanity_checks) { - if (slab_empty(s)) - set_obj(s, "sanity_checks", 0); - else - fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name); - } - if (redzone && !s->red_zone) { - if (slab_empty(s)) - set_obj(s, "red_zone", 1); - else - fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name); - } - if (!redzone && s->red_zone) { - if (slab_empty(s)) - set_obj(s, "red_zone", 0); - else - fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name); - } - if (poison && !s->poison) { - if (slab_empty(s)) - set_obj(s, "poison", 1); - else - fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name); - } - if (!poison && s->poison) { - if (slab_empty(s)) - set_obj(s, "poison", 0); - else - fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name); - } - if (tracking && !s->store_user) { - if (slab_empty(s)) - set_obj(s, "store_user", 1); - else - fprintf(stderr, "%s not empty cannot enable tracking\n", s->name); - } - if (!tracking && s->store_user) { - if (slab_empty(s)) - set_obj(s, "store_user", 0); - else - fprintf(stderr, "%s not empty cannot disable tracking\n", s->name); - } - if (tracing && !s->trace) { - if (slabs == 1) - set_obj(s, "trace", 1); - else - fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name); - } - if (!tracing && s->trace) - set_obj(s, "trace", 1); -} - -static void totals(void) -{ - struct slabinfo *s; - - int used_slabs = 0; - char b1[20], b2[20], b3[20], b4[20]; - unsigned long long max = 1ULL << 63; - - /* Object size */ - unsigned long long min_objsize = max, max_objsize = 0, avg_objsize; - - /* Number of partial slabs in a slabcache */ - unsigned long long min_partial = max, max_partial = 0, - avg_partial, total_partial = 0; - - /* Number of slabs in a slab cache */ - unsigned long long min_slabs = max, max_slabs = 0, - avg_slabs, total_slabs = 0; - - /* Size of the whole slab */ - unsigned long long min_size = max, max_size = 0, - avg_size, total_size = 0; - - /* Bytes used for object storage in a slab */ - unsigned long long min_used = max, max_used = 0, - avg_used, total_used = 0; - - /* Waste: Bytes used for alignment and padding */ - unsigned long long min_waste = max, max_waste = 0, - avg_waste, total_waste = 0; - /* Number of objects in a slab */ - unsigned long long min_objects = max, max_objects = 0, - avg_objects, total_objects = 0; - /* Waste per object */ - unsigned long long min_objwaste = max, - max_objwaste = 0, avg_objwaste, - total_objwaste = 0; - - /* Memory per object */ - unsigned long long min_memobj = max, - max_memobj = 0, avg_memobj, - total_objsize = 0; - - /* Percentage of partial slabs per slab */ - unsigned long min_ppart = 100, max_ppart = 0, - avg_ppart, total_ppart = 0; - - /* Number of objects in partial slabs */ - unsigned long min_partobj = max, max_partobj = 0, - avg_partobj, total_partobj = 0; - - /* Percentage of partial objects of all objects in a slab */ - unsigned long min_ppartobj = 100, max_ppartobj = 0, - avg_ppartobj, total_ppartobj = 0; - - - for (s = slabinfo; s < slabinfo + slabs; s++) { - unsigned long long size; - unsigned long used; - unsigned long long wasted; - unsigned long long objwaste; - unsigned long percentage_partial_slabs; - unsigned long percentage_partial_objs; - - if (!s->slabs || !s->objects) - continue; - - used_slabs++; - - size = slab_size(s); - used = s->objects * s->object_size; - wasted = size - used; - objwaste = s->slab_size - s->object_size; - - percentage_partial_slabs = s->partial * 100 / s->slabs; - if (percentage_partial_slabs > 100) - percentage_partial_slabs = 100; - - percentage_partial_objs = s->objects_partial * 100 - / s->objects; - - if (percentage_partial_objs > 100) - percentage_partial_objs = 100; - - if (s->object_size < min_objsize) - min_objsize = s->object_size; - if (s->partial < min_partial) - min_partial = s->partial; - if (s->slabs < min_slabs) - min_slabs = s->slabs; - if (size < min_size) - min_size = size; - if (wasted < min_waste) - min_waste = wasted; - if (objwaste < min_objwaste) - min_objwaste = objwaste; - if (s->objects < min_objects) - min_objects = s->objects; - if (used < min_used) - min_used = used; - if (s->objects_partial < min_partobj) - min_partobj = s->objects_partial; - if (percentage_partial_slabs < min_ppart) - min_ppart = percentage_partial_slabs; - if (percentage_partial_objs < min_ppartobj) - min_ppartobj = percentage_partial_objs; - if (s->slab_size < min_memobj) - min_memobj = s->slab_size; - - if (s->object_size > max_objsize) - max_objsize = s->object_size; - if (s->partial > max_partial) - max_partial = s->partial; - if (s->slabs > max_slabs) - max_slabs = s->slabs; - if (size > max_size) - max_size = size; - if (wasted > max_waste) - max_waste = wasted; - if (objwaste > max_objwaste) - max_objwaste = objwaste; - if (s->objects > max_objects) - max_objects = s->objects; - if (used > max_used) - max_used = used; - if (s->objects_partial > max_partobj) - max_partobj = s->objects_partial; - if (percentage_partial_slabs > max_ppart) - max_ppart = percentage_partial_slabs; - if (percentage_partial_objs > max_ppartobj) - max_ppartobj = percentage_partial_objs; - if (s->slab_size > max_memobj) - max_memobj = s->slab_size; - - total_partial += s->partial; - total_slabs += s->slabs; - total_size += size; - total_waste += wasted; - - total_objects += s->objects; - total_used += used; - total_partobj += s->objects_partial; - total_ppart += percentage_partial_slabs; - total_ppartobj += percentage_partial_objs; - - total_objwaste += s->objects * objwaste; - total_objsize += s->objects * s->slab_size; - } - - if (!total_objects) { - printf("No objects\n"); - return; - } - if (!used_slabs) { - printf("No slabs\n"); - return; - } - - /* Per slab averages */ - avg_partial = total_partial / used_slabs; - avg_slabs = total_slabs / used_slabs; - avg_size = total_size / used_slabs; - avg_waste = total_waste / used_slabs; - - avg_objects = total_objects / used_slabs; - avg_used = total_used / used_slabs; - avg_partobj = total_partobj / used_slabs; - avg_ppart = total_ppart / used_slabs; - avg_ppartobj = total_ppartobj / used_slabs; - - /* Per object object sizes */ - avg_objsize = total_used / total_objects; - avg_objwaste = total_objwaste / total_objects; - avg_partobj = total_partobj * 100 / total_objects; - avg_memobj = total_objsize / total_objects; - - printf("Slabcache Totals\n"); - printf("----------------\n"); - printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", - slabs, aliases, alias_targets, used_slabs); - - store_size(b1, total_size);store_size(b2, total_waste); - store_size(b3, total_waste * 100 / total_used); - printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); - - store_size(b1, total_objects);store_size(b2, total_partobj); - store_size(b3, total_partobj * 100 / total_objects); - printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); - - printf("\n"); - printf("Per Cache Average " - "Min Max Total\n"); - printf("---------------------------------------" - "-------------------------------------\n"); - - store_size(b1, avg_objects);store_size(b2, min_objects); - store_size(b3, max_objects);store_size(b4, total_objects); - printf("#Objects %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_slabs);store_size(b2, min_slabs); - store_size(b3, max_slabs);store_size(b4, total_slabs); - printf("#Slabs %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_partial);store_size(b2, min_partial); - store_size(b3, max_partial);store_size(b4, total_partial); - printf("#PartSlab %15s %15s %15s %15s\n", - b1, b2, b3, b4); - store_size(b1, avg_ppart);store_size(b2, min_ppart); - store_size(b3, max_ppart); - store_size(b4, total_partial * 100 / total_slabs); - printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", - b1, b2, b3, b4); - - store_size(b1, avg_partobj);store_size(b2, min_partobj); - store_size(b3, max_partobj); - store_size(b4, total_partobj); - printf("PartObjs %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); - store_size(b3, max_ppartobj); - store_size(b4, total_partobj * 100 / total_objects); - printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", - b1, b2, b3, b4); - - store_size(b1, avg_size);store_size(b2, min_size); - store_size(b3, max_size);store_size(b4, total_size); - printf("Memory %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_used);store_size(b2, min_used); - store_size(b3, max_used);store_size(b4, total_used); - printf("Used %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_waste);store_size(b2, min_waste); - store_size(b3, max_waste);store_size(b4, total_waste); - printf("Loss %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - printf("\n"); - printf("Per Object Average " - "Min Max\n"); - printf("---------------------------------------" - "--------------------\n"); - - store_size(b1, avg_memobj);store_size(b2, min_memobj); - store_size(b3, max_memobj); - printf("Memory %15s %15s %15s\n", - b1, b2, b3); - store_size(b1, avg_objsize);store_size(b2, min_objsize); - store_size(b3, max_objsize); - printf("User %15s %15s %15s\n", - b1, b2, b3); - - store_size(b1, avg_objwaste);store_size(b2, min_objwaste); - store_size(b3, max_objwaste); - printf("Loss %15s %15s %15s\n", - b1, b2, b3); -} - -static void sort_slabs(void) -{ - struct slabinfo *s1,*s2; - - for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { - for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { - int result; - - if (sort_size) { - if (slab_size(s1) == slab_size(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_size(s1) < slab_size(s2); - } else if (sort_active) { - if (slab_activity(s1) == slab_activity(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_activity(s1) < slab_activity(s2); - } else if (sort_loss) { - if (slab_waste(s1) == slab_waste(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_waste(s1) < slab_waste(s2); - } else if (sort_partial) { - if (s1->partial == s2->partial) - result = strcasecmp(s1->name, s2->name); - else - result = s1->partial < s2->partial; - } else - result = strcasecmp(s1->name, s2->name); - - if (show_inverted) - result = -result; - - if (result > 0) { - struct slabinfo t; - - memcpy(&t, s1, sizeof(struct slabinfo)); - memcpy(s1, s2, sizeof(struct slabinfo)); - memcpy(s2, &t, sizeof(struct slabinfo)); - } - } - } -} - -static void sort_aliases(void) -{ - struct aliasinfo *a1,*a2; - - for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { - for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { - char *n1, *n2; - - n1 = a1->name; - n2 = a2->name; - if (show_alias && !show_inverted) { - n1 = a1->ref; - n2 = a2->ref; - } - if (strcasecmp(n1, n2) > 0) { - struct aliasinfo t; - - memcpy(&t, a1, sizeof(struct aliasinfo)); - memcpy(a1, a2, sizeof(struct aliasinfo)); - memcpy(a2, &t, sizeof(struct aliasinfo)); - } - } - } -} - -static void link_slabs(void) -{ - struct aliasinfo *a; - struct slabinfo *s; - - for (a = aliasinfo; a < aliasinfo + aliases; a++) { - - for (s = slabinfo; s < slabinfo + slabs; s++) - if (strcmp(a->ref, s->name) == 0) { - a->slab = s; - s->refs++; - break; - } - if (s == slabinfo + slabs) - fatal("Unresolved alias %s\n", a->ref); - } -} - -static void alias(void) -{ - struct aliasinfo *a; - char *active = NULL; - - sort_aliases(); - link_slabs(); - - for(a = aliasinfo; a < aliasinfo + aliases; a++) { - - if (!show_single_ref && a->slab->refs == 1) - continue; - - if (!show_inverted) { - if (active) { - if (strcmp(a->slab->name, active) == 0) { - printf(" %s", a->name); - continue; - } - } - printf("\n%-12s <- %s", a->slab->name, a->name); - active = a->slab->name; - } - else - printf("%-15s -> %s\n", a->name, a->slab->name); - } - if (active) - printf("\n"); -} - - -static void rename_slabs(void) -{ - struct slabinfo *s; - struct aliasinfo *a; - - for (s = slabinfo; s < slabinfo + slabs; s++) { - if (*s->name != ':') - continue; - - if (s->refs > 1 && !show_first_alias) - continue; - - a = find_one_alias(s); - - if (a) - s->name = a->name; - else { - s->name = "*"; - actual_slabs--; - } - } -} - -static int slab_mismatch(char *slab) -{ - return regexec(&pattern, slab, 0, NULL, 0); -} - -static void read_slab_dir(void) -{ - DIR *dir; - struct dirent *de; - struct slabinfo *slab = slabinfo; - struct aliasinfo *alias = aliasinfo; - char *p; - char *t; - int count; - - if (chdir("/sys/kernel/slab") && chdir("/sys/slab")) - fatal("SYSFS support for SLUB not active\n"); - - dir = opendir("."); - while ((de = readdir(dir))) { - if (de->d_name[0] == '.' || - (de->d_name[0] != ':' && slab_mismatch(de->d_name))) - continue; - switch (de->d_type) { - case DT_LNK: - alias->name = strdup(de->d_name); - count = readlink(de->d_name, buffer, sizeof(buffer)-1); - - if (count < 0) - fatal("Cannot read symlink %s\n", de->d_name); - - buffer[count] = 0; - p = buffer + count; - while (p > buffer && p[-1] != '/') - p--; - alias->ref = strdup(p); - alias++; - break; - case DT_DIR: - if (chdir(de->d_name)) - fatal("Unable to access slab %s\n", slab->name); - slab->name = strdup(de->d_name); - slab->alias = 0; - slab->refs = 0; - slab->aliases = get_obj("aliases"); - slab->align = get_obj("align"); - slab->cache_dma = get_obj("cache_dma"); - slab->cpu_slabs = get_obj("cpu_slabs"); - slab->destroy_by_rcu = get_obj("destroy_by_rcu"); - slab->hwcache_align = get_obj("hwcache_align"); - slab->object_size = get_obj("object_size"); - slab->objects = get_obj("objects"); - slab->objects_partial = get_obj("objects_partial"); - slab->objects_total = get_obj("objects_total"); - slab->objs_per_slab = get_obj("objs_per_slab"); - slab->order = get_obj("order"); - slab->partial = get_obj("partial"); - slab->partial = get_obj_and_str("partial", &t); - decode_numa_list(slab->numa_partial, t); - free(t); - slab->poison = get_obj("poison"); - slab->reclaim_account = get_obj("reclaim_account"); - slab->red_zone = get_obj("red_zone"); - slab->sanity_checks = get_obj("sanity_checks"); - slab->slab_size = get_obj("slab_size"); - slab->slabs = get_obj_and_str("slabs", &t); - decode_numa_list(slab->numa, t); - free(t); - slab->store_user = get_obj("store_user"); - slab->trace = get_obj("trace"); - slab->alloc_fastpath = get_obj("alloc_fastpath"); - slab->alloc_slowpath = get_obj("alloc_slowpath"); - slab->free_fastpath = get_obj("free_fastpath"); - slab->free_slowpath = get_obj("free_slowpath"); - slab->free_frozen= get_obj("free_frozen"); - slab->free_add_partial = get_obj("free_add_partial"); - slab->free_remove_partial = get_obj("free_remove_partial"); - slab->alloc_from_partial = get_obj("alloc_from_partial"); - slab->alloc_slab = get_obj("alloc_slab"); - slab->alloc_refill = get_obj("alloc_refill"); - slab->free_slab = get_obj("free_slab"); - slab->cpuslab_flush = get_obj("cpuslab_flush"); - slab->deactivate_full = get_obj("deactivate_full"); - slab->deactivate_empty = get_obj("deactivate_empty"); - slab->deactivate_to_head = get_obj("deactivate_to_head"); - slab->deactivate_to_tail = get_obj("deactivate_to_tail"); - slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); - slab->order_fallback = get_obj("order_fallback"); - slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); - slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); - slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); - slab->cpu_partial_free = get_obj("cpu_partial_free"); - slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); - slab->deactivate_bypass = get_obj("deactivate_bypass"); - chdir(".."); - if (slab->name[0] == ':') - alias_targets++; - slab++; - break; - default : - fatal("Unknown file type %lx\n", de->d_type); - } - } - closedir(dir); - slabs = slab - slabinfo; - actual_slabs = slabs; - aliases = alias - aliasinfo; - if (slabs > MAX_SLABS) - fatal("Too many slabs\n"); - if (aliases > MAX_ALIASES) - fatal("Too many aliases\n"); -} - -static void output_slabs(void) -{ - struct slabinfo *slab; - int lines = output_lines; - - for (slab = slabinfo; (slab < slabinfo + slabs) && - lines != 0; slab++) { - - if (slab->alias) - continue; - - if (lines != -1) - lines--; - - if (show_numa) - slab_numa(slab, 0); - else if (show_track) - show_tracking(slab); - else if (validate) - slab_validate(slab); - else if (shrink) - slab_shrink(slab); - else if (set_debug) - slab_debug(slab); - else if (show_ops) - ops(slab); - else if (show_slab) - slabcache(slab); - else if (show_report) - report(slab); - } -} - -static void _xtotals(char *heading, char *underline, - int loss, int size, int partial) -{ - printf("%s%s", heading, underline); - line = 0; - sort_loss = loss; - sort_size = size; - sort_partial = partial; - sort_slabs(); - output_slabs(); -} - -static void xtotals(void) -{ - char *heading, *underline; - - totals(); - - link_slabs(); - rename_slabs(); - - heading = "\nSlabs sorted by size\n"; - underline = "--------------------\n"; - _xtotals(heading, underline, 0, 1, 0); - - heading = "\nSlabs sorted by loss\n"; - underline = "--------------------\n"; - _xtotals(heading, underline, 1, 0, 0); - - heading = "\nSlabs sorted by number of partial slabs\n"; - underline = "---------------------------------------\n"; - _xtotals(heading, underline, 0, 0, 1); - - printf("\n"); -} - -struct option opts[] = { - { "aliases", no_argument, NULL, 'a' }, - { "activity", no_argument, NULL, 'A' }, - { "Bytes", no_argument, NULL, 'B'}, - { "debug", optional_argument, NULL, 'd' }, - { "display-activity", no_argument, NULL, 'D' }, - { "empty", no_argument, NULL, 'e' }, - { "first-alias", no_argument, NULL, 'f' }, - { "help", no_argument, NULL, 'h' }, - { "inverted", no_argument, NULL, 'i'}, - { "slabs", no_argument, NULL, 'l' }, - { "Loss", no_argument, NULL, 'L'}, - { "numa", no_argument, NULL, 'n' }, - { "lines", required_argument, NULL, 'N'}, - { "ops", no_argument, NULL, 'o' }, - { "partial", no_argument, NULL, 'p'}, - { "report", no_argument, NULL, 'r' }, - { "shrink", no_argument, NULL, 's' }, - { "Size", no_argument, NULL, 'S'}, - { "tracking", no_argument, NULL, 't'}, - { "Totals", no_argument, NULL, 'T'}, - { "Unreclaim", no_argument, NULL, 'U'}, - { "validate", no_argument, NULL, 'v' }, - { "Xtotals", no_argument, NULL, 'X'}, - { "zero", no_argument, NULL, 'z' }, - { "1ref", no_argument, NULL, '1'}, - { NULL, 0, NULL, 0 } -}; - -int main(int argc, char *argv[]) -{ - int c; - int err; - char *pattern_source; - - page_size = getpagesize(); - - while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1", - opts, NULL)) != -1) - switch (c) { - case 'a': - show_alias = 1; - break; - case 'A': - sort_active = 1; - break; - case 'B': - show_bytes = 1; - break; - case 'd': - set_debug = 1; - if (!debug_opt_scan(optarg)) - fatal("Invalid debug option '%s'\n", optarg); - break; - case 'D': - show_activity = 1; - break; - case 'e': - show_empty = 1; - break; - case 'f': - show_first_alias = 1; - break; - case 'h': - usage(); - return 0; - case 'i': - show_inverted = 1; - break; - case 'l': - show_slab = 1; - break; - case 'L': - sort_loss = 1; - break; - case 'n': - show_numa = 1; - break; - case 'N': - if (optarg) { - output_lines = atoi(optarg); - if (output_lines < 1) - output_lines = 1; - } - break; - case 'o': - show_ops = 1; - break; - case 'r': - show_report = 1; - break; - case 'P': - sort_partial = 1; - break; - case 's': - shrink = 1; - break; - case 'S': - sort_size = 1; - break; - case 't': - show_track = 1; - break; - case 'T': - show_totals = 1; - break; - case 'U': - unreclaim_only = 1; - break; - case 'v': - validate = 1; - break; - case 'X': - if (output_lines == -1) - output_lines = 1; - extended_totals = 1; - show_bytes = 1; - break; - case 'z': - skip_zero = 0; - break; - case '1': - show_single_ref = 1; - break; - default: - fatal("%s: Invalid option '%c'\n", argv[0], optopt); - - } - - if (!show_slab && !show_alias && !show_track && !show_report - && !validate && !shrink && !set_debug && !show_ops) - show_slab = 1; - - if (argc > optind) - pattern_source = argv[optind]; - else - pattern_source = ".*"; - - err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB); - if (err) - fatal("%s: Invalid pattern '%s' code %d\n", - argv[0], pattern_source, err); - read_slab_dir(); - if (show_alias) { - alias(); - } else if (extended_totals) { - xtotals(); - } else if (show_totals) { - totals(); - } else { - link_slabs(); - rename_slabs(); - sort_slabs(); - output_slabs(); - } - return 0; -} -- cgit v1.2.3 From baa489fabd01596d5426d6e112b34ba5fb59ab82 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:53 +0000 Subject: selftests/vm: rename selftests/vm to selftests/mm Rename selftets/vm to selftests/mm for being more consistent with the code, documentation, and tools directories, and won't be confused with virtual machines. [sj@kernel.org: convert missing vm->mm changes] Link: https://lkml.kernel.org/r/20230107230643.252273-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/hugetlbpage.rst | 6 +- Documentation/core-api/pin_user_pages.rst | 2 +- MAINTAINERS | 4 +- mm/Kconfig | 2 +- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kselftest_deps.sh | 6 +- tools/testing/selftests/mm/.gitignore | 38 + tools/testing/selftests/mm/Makefile | 180 ++ .../selftests/mm/charge_reserved_hugetlb.sh | 584 ++++++ tools/testing/selftests/mm/check_config.sh | 31 + tools/testing/selftests/mm/compaction_test.c | 231 +++ tools/testing/selftests/mm/config | 8 + tools/testing/selftests/mm/cow.c | 1764 +++++++++++++++++ tools/testing/selftests/mm/gup_test.c | 271 +++ tools/testing/selftests/mm/hmm-tests.c | 2054 ++++++++++++++++++++ tools/testing/selftests/mm/hugepage-mmap.c | 91 + tools/testing/selftests/mm/hugepage-mremap.c | 188 ++ tools/testing/selftests/mm/hugepage-shm.c | 101 + tools/testing/selftests/mm/hugepage-vmemmap.c | 144 ++ tools/testing/selftests/mm/hugetlb-madvise.c | 406 ++++ .../selftests/mm/hugetlb_reparenting_test.sh | 252 +++ tools/testing/selftests/mm/khugepaged.c | 1558 +++++++++++++++ tools/testing/selftests/mm/ksm_functional_tests.c | 279 +++ tools/testing/selftests/mm/ksm_tests.c | 849 ++++++++ tools/testing/selftests/mm/madv_populate.c | 296 +++ tools/testing/selftests/mm/map_fixed_noreplace.c | 231 +++ tools/testing/selftests/mm/map_hugetlb.c | 109 ++ tools/testing/selftests/mm/map_populate.c | 113 ++ tools/testing/selftests/mm/memfd_secret.c | 296 +++ tools/testing/selftests/mm/migration.c | 193 ++ tools/testing/selftests/mm/mlock-random-test.c | 294 +++ tools/testing/selftests/mm/mlock2-tests.c | 520 +++++ tools/testing/selftests/mm/mlock2.h | 63 + tools/testing/selftests/mm/mrelease_test.c | 206 ++ tools/testing/selftests/mm/mremap_dontunmap.c | 364 ++++ tools/testing/selftests/mm/mremap_test.c | 475 +++++ tools/testing/selftests/mm/on-fault-limit.c | 48 + tools/testing/selftests/mm/pkey-helpers.h | 226 +++ tools/testing/selftests/mm/pkey-powerpc.h | 133 ++ tools/testing/selftests/mm/pkey-x86.h | 177 ++ tools/testing/selftests/mm/protection_keys.c | 1788 +++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 274 +++ tools/testing/selftests/mm/settings | 1 + tools/testing/selftests/mm/soft-dirty.c | 210 ++ tools/testing/selftests/mm/split_huge_page_test.c | 309 +++ tools/testing/selftests/mm/test_hmm.sh | 105 + tools/testing/selftests/mm/test_vmalloc.sh | 177 ++ tools/testing/selftests/mm/thuge-gen.c | 257 +++ tools/testing/selftests/mm/transhuge-stress.c | 122 ++ tools/testing/selftests/mm/userfaultfd.c | 1858 ++++++++++++++++++ tools/testing/selftests/mm/util.h | 69 + tools/testing/selftests/mm/va_128TBswitch.c | 289 +++ tools/testing/selftests/mm/va_128TBswitch.sh | 54 + tools/testing/selftests/mm/virtual_address_range.c | 139 ++ tools/testing/selftests/mm/vm_util.c | 151 ++ tools/testing/selftests/mm/vm_util.h | 15 + tools/testing/selftests/mm/write_hugetlb_memory.sh | 23 + tools/testing/selftests/mm/write_to_hugetlbfs.c | 240 +++ tools/testing/selftests/vm/.gitignore | 38 - tools/testing/selftests/vm/Makefile | 180 -- .../selftests/vm/charge_reserved_hugetlb.sh | 584 ------ tools/testing/selftests/vm/check_config.sh | 31 - tools/testing/selftests/vm/compaction_test.c | 231 --- tools/testing/selftests/vm/config | 8 - tools/testing/selftests/vm/cow.c | 1764 ----------------- tools/testing/selftests/vm/gup_test.c | 271 --- tools/testing/selftests/vm/hmm-tests.c | 2054 -------------------- tools/testing/selftests/vm/hugepage-mmap.c | 91 - tools/testing/selftests/vm/hugepage-mremap.c | 188 -- tools/testing/selftests/vm/hugepage-shm.c | 101 - tools/testing/selftests/vm/hugepage-vmemmap.c | 144 -- tools/testing/selftests/vm/hugetlb-madvise.c | 406 ---- .../selftests/vm/hugetlb_reparenting_test.sh | 252 --- tools/testing/selftests/vm/khugepaged.c | 1558 --------------- tools/testing/selftests/vm/ksm_functional_tests.c | 279 --- tools/testing/selftests/vm/ksm_tests.c | 849 -------- tools/testing/selftests/vm/madv_populate.c | 296 --- tools/testing/selftests/vm/map_fixed_noreplace.c | 231 --- tools/testing/selftests/vm/map_hugetlb.c | 109 -- tools/testing/selftests/vm/map_populate.c | 113 -- tools/testing/selftests/vm/memfd_secret.c | 296 --- tools/testing/selftests/vm/migration.c | 193 -- tools/testing/selftests/vm/mlock-random-test.c | 294 --- tools/testing/selftests/vm/mlock2-tests.c | 520 ----- tools/testing/selftests/vm/mlock2.h | 63 - tools/testing/selftests/vm/mrelease_test.c | 206 -- tools/testing/selftests/vm/mremap_dontunmap.c | 364 ---- tools/testing/selftests/vm/mremap_test.c | 475 ----- tools/testing/selftests/vm/on-fault-limit.c | 48 - tools/testing/selftests/vm/pkey-helpers.h | 226 --- tools/testing/selftests/vm/pkey-powerpc.h | 133 -- tools/testing/selftests/vm/pkey-x86.h | 177 -- tools/testing/selftests/vm/protection_keys.c | 1788 ----------------- tools/testing/selftests/vm/run_vmtests.sh | 274 --- tools/testing/selftests/vm/settings | 1 - tools/testing/selftests/vm/soft-dirty.c | 210 -- tools/testing/selftests/vm/split_huge_page_test.c | 309 --- tools/testing/selftests/vm/test_hmm.sh | 105 - tools/testing/selftests/vm/test_vmalloc.sh | 177 -- tools/testing/selftests/vm/thuge-gen.c | 257 --- tools/testing/selftests/vm/transhuge-stress.c | 122 -- tools/testing/selftests/vm/userfaultfd.c | 1858 ------------------ tools/testing/selftests/vm/util.h | 69 - tools/testing/selftests/vm/va_128TBswitch.c | 289 --- tools/testing/selftests/vm/va_128TBswitch.sh | 54 - tools/testing/selftests/vm/virtual_address_range.c | 139 -- tools/testing/selftests/vm/vm_util.c | 151 -- tools/testing/selftests/vm/vm_util.h | 15 - tools/testing/selftests/vm/write_hugetlb_memory.sh | 23 - tools/testing/selftests/vm/write_to_hugetlbfs.c | 240 --- 110 files changed, 18865 insertions(+), 18865 deletions(-) create mode 100644 tools/testing/selftests/mm/.gitignore create mode 100644 tools/testing/selftests/mm/Makefile create mode 100644 tools/testing/selftests/mm/charge_reserved_hugetlb.sh create mode 100644 tools/testing/selftests/mm/check_config.sh create mode 100644 tools/testing/selftests/mm/compaction_test.c create mode 100644 tools/testing/selftests/mm/config create mode 100644 tools/testing/selftests/mm/cow.c create mode 100644 tools/testing/selftests/mm/gup_test.c create mode 100644 tools/testing/selftests/mm/hmm-tests.c create mode 100644 tools/testing/selftests/mm/hugepage-mmap.c create mode 100644 tools/testing/selftests/mm/hugepage-mremap.c create mode 100644 tools/testing/selftests/mm/hugepage-shm.c create mode 100644 tools/testing/selftests/mm/hugepage-vmemmap.c create mode 100644 tools/testing/selftests/mm/hugetlb-madvise.c create mode 100644 tools/testing/selftests/mm/hugetlb_reparenting_test.sh create mode 100644 tools/testing/selftests/mm/khugepaged.c create mode 100644 tools/testing/selftests/mm/ksm_functional_tests.c create mode 100644 tools/testing/selftests/mm/ksm_tests.c create mode 100644 tools/testing/selftests/mm/madv_populate.c create mode 100644 tools/testing/selftests/mm/map_fixed_noreplace.c create mode 100644 tools/testing/selftests/mm/map_hugetlb.c create mode 100644 tools/testing/selftests/mm/map_populate.c create mode 100644 tools/testing/selftests/mm/memfd_secret.c create mode 100644 tools/testing/selftests/mm/migration.c create mode 100644 tools/testing/selftests/mm/mlock-random-test.c create mode 100644 tools/testing/selftests/mm/mlock2-tests.c create mode 100644 tools/testing/selftests/mm/mlock2.h create mode 100644 tools/testing/selftests/mm/mrelease_test.c create mode 100644 tools/testing/selftests/mm/mremap_dontunmap.c create mode 100644 tools/testing/selftests/mm/mremap_test.c create mode 100644 tools/testing/selftests/mm/on-fault-limit.c create mode 100644 tools/testing/selftests/mm/pkey-helpers.h create mode 100644 tools/testing/selftests/mm/pkey-powerpc.h create mode 100644 tools/testing/selftests/mm/pkey-x86.h create mode 100644 tools/testing/selftests/mm/protection_keys.c create mode 100644 tools/testing/selftests/mm/run_vmtests.sh create mode 100644 tools/testing/selftests/mm/settings create mode 100644 tools/testing/selftests/mm/soft-dirty.c create mode 100644 tools/testing/selftests/mm/split_huge_page_test.c create mode 100644 tools/testing/selftests/mm/test_hmm.sh create mode 100644 tools/testing/selftests/mm/test_vmalloc.sh create mode 100644 tools/testing/selftests/mm/thuge-gen.c create mode 100644 tools/testing/selftests/mm/transhuge-stress.c create mode 100644 tools/testing/selftests/mm/userfaultfd.c create mode 100644 tools/testing/selftests/mm/util.h create mode 100644 tools/testing/selftests/mm/va_128TBswitch.c create mode 100644 tools/testing/selftests/mm/va_128TBswitch.sh create mode 100644 tools/testing/selftests/mm/virtual_address_range.c create mode 100644 tools/testing/selftests/mm/vm_util.c create mode 100644 tools/testing/selftests/mm/vm_util.h create mode 100644 tools/testing/selftests/mm/write_hugetlb_memory.sh create mode 100644 tools/testing/selftests/mm/write_to_hugetlbfs.c delete mode 100644 tools/testing/selftests/vm/.gitignore delete mode 100644 tools/testing/selftests/vm/Makefile delete mode 100644 tools/testing/selftests/vm/charge_reserved_hugetlb.sh delete mode 100644 tools/testing/selftests/vm/check_config.sh delete mode 100644 tools/testing/selftests/vm/compaction_test.c delete mode 100644 tools/testing/selftests/vm/config delete mode 100644 tools/testing/selftests/vm/cow.c delete mode 100644 tools/testing/selftests/vm/gup_test.c delete mode 100644 tools/testing/selftests/vm/hmm-tests.c delete mode 100644 tools/testing/selftests/vm/hugepage-mmap.c delete mode 100644 tools/testing/selftests/vm/hugepage-mremap.c delete mode 100644 tools/testing/selftests/vm/hugepage-shm.c delete mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c delete mode 100644 tools/testing/selftests/vm/hugetlb-madvise.c delete mode 100644 tools/testing/selftests/vm/hugetlb_reparenting_test.sh delete mode 100644 tools/testing/selftests/vm/khugepaged.c delete mode 100644 tools/testing/selftests/vm/ksm_functional_tests.c delete mode 100644 tools/testing/selftests/vm/ksm_tests.c delete mode 100644 tools/testing/selftests/vm/madv_populate.c delete mode 100644 tools/testing/selftests/vm/map_fixed_noreplace.c delete mode 100644 tools/testing/selftests/vm/map_hugetlb.c delete mode 100644 tools/testing/selftests/vm/map_populate.c delete mode 100644 tools/testing/selftests/vm/memfd_secret.c delete mode 100644 tools/testing/selftests/vm/migration.c delete mode 100644 tools/testing/selftests/vm/mlock-random-test.c delete mode 100644 tools/testing/selftests/vm/mlock2-tests.c delete mode 100644 tools/testing/selftests/vm/mlock2.h delete mode 100644 tools/testing/selftests/vm/mrelease_test.c delete mode 100644 tools/testing/selftests/vm/mremap_dontunmap.c delete mode 100644 tools/testing/selftests/vm/mremap_test.c delete mode 100644 tools/testing/selftests/vm/on-fault-limit.c delete mode 100644 tools/testing/selftests/vm/pkey-helpers.h delete mode 100644 tools/testing/selftests/vm/pkey-powerpc.h delete mode 100644 tools/testing/selftests/vm/pkey-x86.h delete mode 100644 tools/testing/selftests/vm/protection_keys.c delete mode 100755 tools/testing/selftests/vm/run_vmtests.sh delete mode 100644 tools/testing/selftests/vm/settings delete mode 100644 tools/testing/selftests/vm/soft-dirty.c delete mode 100644 tools/testing/selftests/vm/split_huge_page_test.c delete mode 100755 tools/testing/selftests/vm/test_hmm.sh delete mode 100755 tools/testing/selftests/vm/test_vmalloc.sh delete mode 100644 tools/testing/selftests/vm/thuge-gen.c delete mode 100644 tools/testing/selftests/vm/transhuge-stress.c delete mode 100644 tools/testing/selftests/vm/userfaultfd.c delete mode 100644 tools/testing/selftests/vm/util.h delete mode 100644 tools/testing/selftests/vm/va_128TBswitch.c delete mode 100755 tools/testing/selftests/vm/va_128TBswitch.sh delete mode 100644 tools/testing/selftests/vm/virtual_address_range.c delete mode 100644 tools/testing/selftests/vm/vm_util.c delete mode 100644 tools/testing/selftests/vm/vm_util.h delete mode 100644 tools/testing/selftests/vm/write_hugetlb_memory.sh delete mode 100644 tools/testing/selftests/vm/write_to_hugetlbfs.c diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 19f27c0d92e0..a969a2c742b2 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -461,13 +461,13 @@ Examples .. _map_hugetlb: ``map_hugetlb`` - see tools/testing/selftests/vm/map_hugetlb.c + see tools/testing/selftests/mm/map_hugetlb.c ``hugepage-shm`` - see tools/testing/selftests/vm/hugepage-shm.c + see tools/testing/selftests/mm/hugepage-shm.c ``hugepage-mmap`` - see tools/testing/selftests/vm/hugepage-mmap.c + see tools/testing/selftests/mm/hugepage-mmap.c The `libhugetlbfs`_ library provides a wide range of userspace tools to help with huge page usability, environment setup, and control. diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index b18416f4500f..facafbdecb95 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -221,7 +221,7 @@ Unit testing ============ This file:: - tools/testing/selftests/vm/gup_test.c + tools/testing/selftests/mm/gup_test.c has the following new calls to exercise the new pin*() wrapper functions: diff --git a/MAINTAINERS b/MAINTAINERS index c726adfd1f0d..8ac1472bea34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9466,7 +9466,7 @@ F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* -F: tools/testing/selftests/vm/*hmm* +F: tools/testing/selftests/mm/*hmm* HOST AP DRIVER M: Jouni Malinen @@ -13484,7 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/mm/ -F: tools/testing/selftests/vm/ +F: tools/testing/selftests/mm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig b/mm/Kconfig index ff7b209dec05..39df30dcabe3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1073,7 +1073,7 @@ config GUP_TEST pin_user_pages*(), or pinned via get_user_pages*(), as specified by other command line arguments. - See tools/testing/selftests/vm/gup_test.c + See tools/testing/selftests/mm/gup_test.c comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 41b649452560..56a29f2de8e6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -85,7 +85,7 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += user TARGETS += vDSO -TARGETS += vm +TARGETS += mm TARGETS += x86 TARGETS += zram #Please keep the TARGETS list alphabetically sorted diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index 7424a1f5babc..4bc14d9e8ff1 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -12,9 +12,9 @@ usage() echo -e "Usage: $0 -[p] [test_name]\n" echo -e "\tkselftest_deps.sh [-p] gcc" -echo -e "\tkselftest_deps.sh [-p] gcc vm" +echo -e "\tkselftest_deps.sh [-p] gcc mm" echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc" -echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n" +echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n" echo "- Should be run in selftests directory in the kernel repo." echo "- Checks if Kselftests can be built/cross-built on a system." echo "- Parses all test/sub-test Makefile to find library dependencies." @@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ # Level 2 # Some tests have multiple valid LDLIBS lines for individual sub-tests # that need dependency checks. Find them and append them to the tests -# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread +# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread # Filter out VAR_LDLIBS to discard the following: # memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) # Append space at the end of the list to append more tests. diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore new file mode 100644 index 000000000000..1f8c36a9fa10 --- /dev/null +++ b/tools/testing/selftests/mm/.gitignore @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: GPL-2.0-only +cow +hugepage-mmap +hugepage-mremap +hugepage-shm +hugepage-vmemmap +hugetlb-madvise +khugepaged +map_hugetlb +map_populate +thuge-gen +compaction_test +migration +mlock2-tests +mrelease_test +mremap_dontunmap +mremap_test +on-fault-limit +transhuge-stress +protection_keys +protection_keys_32 +protection_keys_64 +madv_populate +userfaultfd +mlock-intersect-test +mlock-random-test +virtual_address_range +gup_test +va_128TBswitch +map_fixed_noreplace +write_to_hugetlbfs +hmm-tests +memfd_secret +soft-dirty +split_huge_page_test +ksm_tests +local_config.h +local_config.mk diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile new file mode 100644 index 000000000000..6a4b639b2b2b --- /dev/null +++ b/tools/testing/selftests/mm/Makefile @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for mm selftests + +LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h + +include local_config.mk + +uname_M := $(shell uname -m 2>/dev/null || echo not) +MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') + +# Without this, failed build products remain, with up-to-date timestamps, +# thus tricking Make (and you!) into believing that All Is Well, in subsequent +# make invocations: +.DELETE_ON_ERROR: + +# Avoid accidental wrong builds, due to built-in rules working just a little +# bit too well--but not quite as well as required for our situation here. +# +# In other words, "make userfaultfd" is supposed to fail to build at all, +# because this Makefile only supports either "make" (all), or "make /full/path". +# However, the built-in rules, if not suppressed, will pick up CFLAGS and the +# initial LDLIBS (but not the target-specific LDLIBS, because those are only +# set for the full path target!). This causes it to get pretty far into building +# things despite using incorrect values such as an *occasionally* incomplete +# LDLIBS. +MAKEFLAGS += --no-builtin-rules + +CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +LDLIBS = -lrt -lpthread +TEST_GEN_FILES = cow +TEST_GEN_FILES += compaction_test +TEST_GEN_FILES += gup_test +TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap +TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += khugepaged +TEST_GEN_PROGS = madv_populate +TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration +TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test +TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += on-fault-limit +TEST_GEN_FILES += thuge-gen +TEST_GEN_FILES += transhuge-stress +TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += soft-dirty +TEST_GEN_PROGS += split_huge_page_test +TEST_GEN_FILES += ksm_tests +TEST_GEN_PROGS += ksm_functional_tests + +ifeq ($(MACHINE),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) + +VMTARGETS := protection_keys +BINARIES_32 := $(VMTARGETS:%=%_32) +BINARIES_64 := $(VMTARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else + +ifneq (,$(findstring $(MACHINE),ppc64)) +TEST_GEN_FILES += protection_keys +endif + +endif + +ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) +TEST_GEN_FILES += va_128TBswitch +TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs +endif + +TEST_PROGS := run_vmtests.sh + +TEST_FILES := test_vmalloc.sh +TEST_FILES += test_hmm.sh +TEST_FILES += va_128TBswitch.sh + +include ../lib.mk + +$(OUTPUT)/cow: vm_util.c +$(OUTPUT)/khugepaged: vm_util.c +$(OUTPUT)/ksm_functional_tests: vm_util.c +$(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/soft-dirty: vm_util.c +$(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c + +ifeq ($(MACHINE),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 -mxsave +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): $(OUTPUT)/%_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 -mxsave +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): $(OUTPUT)/%_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + +# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) + +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap + +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + +$(OUTPUT)/migration: LDLIBS += -lnuma + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(COW_EXTRA_LIBS),) +all: warn_missing_liburing + +warn_missing_liburing: + @echo ; \ + echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ + echo +endif diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh new file mode 100644 index 000000000000..a5cb4b09a46c --- /dev/null +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -0,0 +1,584 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit $ksft_skip +fi + +fault_limit_file=limit_in_bytes +reservation_limit_file=rsvd.limit_in_bytes +fault_usage_file=usage_in_bytes +reservation_usage_file=rsvd.usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + fault_limit_file=max + reservation_limit_file=rsvd.max + fault_usage_file=current + reservation_usage_file=rsvd.current +fi + +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup2 none $cgroup_path + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 + fi +fi +export cgroup_path + +function cleanup() { + if [[ $cgroup2 ]]; then + echo $$ >$cgroup_path/cgroup.procs + else + echo $$ >$cgroup_path/tasks + fi + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge || echo error + rmdir /mnt/huge + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test1 + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test2 + fi + echo 0 >/proc/sys/vm/nr_hugepages + echo CLEANUP DONE +} + +function expect_equal() { + local expected="$1" + local actual="$2" + local error="$3" + + if [[ "$expected" != "$actual" ]]; then + echo "expected ($expected) != actual ($actual): $3" + cleanup + exit 1 + fi +} + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function setup_cgroup() { + local name="$1" + local cgroup_limit="$2" + local reservation_limit="$3" + + mkdir $cgroup_path/$name + + echo writing cgroup limit: "$cgroup_limit" + echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file + + echo writing reseravation limit: "$reservation_limit" + echo "$reservation_limit" > \ + $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file + + if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then + echo 0 >$cgroup_path/$name/cpuset.cpus + fi + if [ -e "$cgroup_path/$name/cpuset.mems" ]; then + echo 0 >$cgroup_path/$name/cpuset.mems + fi +} + +function wait_for_hugetlb_memory_to_get_depleted() { + local cgroup="$1" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get depleted. + while [ $(cat $path) != 0 ]; do + echo Waiting for hugetlb memory to get depleted. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_reserved() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory reservation to reach size $size. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_written() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory to reach size $size. + cat $path + sleep 0.5 + done +} + +function write_hugetlbfs_and_get_usage() { + local cgroup="$1" + local size="$2" + local populate="$3" + local write="$4" + local path="$5" + local method="$6" + local private="$7" + local expect_failure="$8" + local reserve="$9" + + # Function return values. + reservation_failed=0 + oom_killed=0 + hugetlb_difference=0 + reserved_difference=0 + + local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file + local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file + + local hugetlb_before=$(cat $hugetlb_usage) + local reserved_before=$(cat $reserved_usage) + + echo + echo Starting: + echo hugetlb_usage="$hugetlb_before" + echo reserved_usage="$reserved_before" + echo expect_failure is "$expect_failure" + + output=$(mktemp) + set +e + if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || + [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then + + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & + + local write_result=$? + local write_pid=$! + + until grep -q -i "DONE" $output; do + echo waiting for DONE signal. + if ! ps $write_pid > /dev/null + then + echo "FAIL: The write died" + cleanup + exit 1 + fi + sleep 0.5 + done + + echo ================= write_hugetlb_memory.sh output is: + cat $output + echo ================= end output. + + if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then + wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" + elif [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + else + # This case doesn't produce visible effects, but we still have + # to wait for the async process to start and execute... + sleep 0.5 + fi + + echo write_result is $write_result + else + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "$reserve" + local write_result=$? + + if [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + fi + fi + set -e + + if [[ "$write_result" == 1 ]]; then + reservation_failed=1 + fi + + # On linus/master, the above process gets SIGBUS'd on oomkill, with + # return code 135. On earlier kernels, it gets actual oomkill, with return + # code 137, so just check for both conditions in case we're testing + # against an earlier kernel. + if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then + oom_killed=1 + fi + + local hugetlb_after=$(cat $hugetlb_usage) + local reserved_after=$(cat $reserved_usage) + + echo After write: + echo hugetlb_usage="$hugetlb_after" + echo reserved_usage="$reserved_after" + + hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) + reserved_difference=$(($reserved_after - $reserved_before)) +} + +function cleanup_hugetlb_memory() { + set +e + local cgroup="$1" + if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then + echo killing write_to_hugetlbfs + killall -2 write_to_hugetlbfs + wait_for_hugetlb_memory_to_get_depleted $cgroup + fi + set -e + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge + rmdir /mnt/huge + fi +} + +function run_test() { + local size=$(($1 * ${MB} * 1024 * 1024)) + local populate="$2" + local write="$3" + local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) + local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) + local nr_hugepages="$6" + local method="$7" + local private="$8" + local expect_failure="$9" + local reserve="${10}" + + # Function return values. + hugetlb_difference=0 + reserved_difference=0 + reservation_failed=0 + oom_killed=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ + "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ + "$reserve" + + cleanup_hugetlb_memory "hugetlb_cgroup_test" + + local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) + local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) + + echo $hugetlb_difference + echo $reserved_difference + expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" + expect_equal "0" "$final_reservation" "final reservation is not zero" +} + +function run_multiple_cgroup_test() { + local size1="$1" + local populate1="$2" + local write1="$3" + local cgroup_limit1="$4" + local reservation_limit1="$5" + + local size2="$6" + local populate2="$7" + local write2="$8" + local cgroup_limit2="$9" + local reservation_limit2="${10}" + + local nr_hugepages="${11}" + local method="${12}" + local private="${13}" + local expect_failure="${14}" + local reserve="${15}" + + # Function return values. + hugetlb_difference1=0 + reserved_difference1=0 + reservation_failed1=0 + oom_killed1=0 + + hugetlb_difference2=0 + reserved_difference2=0 + reservation_failed2=0 + oom_killed2=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" + setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ + "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference1=$hugetlb_difference + reserved_difference1=$reserved_difference + reservation_failed1=$reservation_failed + oom_killed1=$oom_killed + + local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file + local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file + local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file + local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file + + local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) + local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ + "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference2=$hugetlb_difference + reserved_difference2=$reserved_difference + reservation_failed2=$reservation_failed + oom_killed2=$oom_killed + + expect_equal "$usage_before_second_write" \ + "$(cat $cgroup1_hugetlb_usage)" "Usage changed." + expect_equal "$reservation_usage_before_second_write" \ + "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." + + cleanup_hugetlb_memory + + local final_hugetlb=$(cat $cgroup1_hugetlb_usage) + local final_reservation=$(cat $cgroup1_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlbt_cgroup_test1 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlbt_cgroup_test1 final reservation is not zero" + + local final_hugetlb=$(cat $cgroup2_hugetlb_usage) + local final_reservation=$(cat $cgroup2_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlb_cgroup_test2 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlb_cgroup_test2 final reservation is not zero" +} + +cleanup + +for populate in "" "-o"; do + for method in 0 1 2; do + for private in "" "-r"; do + for reserve in "" "-n"; do + + # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. + if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then + continue + fi + + # Skip populated shmem tests. Doesn't seem to be supported. + if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then + continue + fi + + if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then + continue + fi + + cleanup + echo + echo + echo + echo Test normal case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + if [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + else + expect_equal "0" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + fi + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + else + expect_equal "0" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + fi + + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + + echo 'PASS' + + cleanup + continue + echo + echo + echo + echo Test more than reservation case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + if [ "$reserve" != "-n" ]; then + run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ + "$reserve" + + expect_equal "1" "$reservation_failed" "Reservation succeeded." + fi + + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test more than cgroup limit case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + # Not sure if shm memory can be cleaned up when the process gets sigbus'd. + if [[ "$method" != 2 ]]; then + run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" + + expect_equal "1" "$oom_killed" "Not oom killed." + fi + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test normal case, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ + "$populate" "" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "5" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + + else + expect_equal "0" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "0" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + fi + + if [[ "$populate" == "-o" ]]; then + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + else + expect_equal "0" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "0" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + fi + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ + "$populate" "-w" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservation charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + expect_equal "5" "$reserved_difference2" \ + "Incorrected reservation charged to cgroup 2." + echo 'PASS' + + cleanup + + done # reserve + done # private + done # populate +done # method + +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh new file mode 100644 index 000000000000..bcba3af0acea --- /dev/null +++ b/tools/testing/selftests/mm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. + +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +# liburing +echo "#include " > $tmpfile_c +echo "#include " >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE + echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE +else + echo "// No liburing support found" > $OUTPUT_H_FILE + echo "# No liburing support found, so:" > $OUTPUT_MKFILE + echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c new file mode 100644 index 000000000000..9b420140ba2b --- /dev/null +++ b/tools/testing/selftests/mm/compaction_test.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * A test for the patch "Allow compaction of unevictable pages". + * With this patch we should be able to allocate at least 1/4 + * of RAM in huge pages. Without the patch much less is + * allocated. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define MAP_SIZE_MB 100 +#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) + +struct map_list { + void *map; + struct map_list *next; +}; + +int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) +{ + char buffer[256] = {0}; + char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; + FILE *cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + + *memfree = atoll(buffer); + cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; + cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + *hugepagesize = atoll(buffer); + + return 0; +} + +int prereq(void) +{ + char allowed; + int fd; + + fd = open("/proc/sys/vm/compact_unevictable_allowed", + O_RDONLY | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + return -1; + } + + if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { + perror("Failed to read from\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + close(fd); + return -1; + } + + close(fd); + if (allowed == '1') + return 0; + + return -1; +} + +int check_compaction(unsigned long mem_free, unsigned int hugepage_size) +{ + int fd; + int compaction_index = 0; + char initial_nr_hugepages[10] = {0}; + char nr_hugepages[10] = {0}; + + /* We want to test with 80% of available memory. Else, OOM killer comes + in to play */ + mem_free = mem_free * 0.8; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open /proc/sys/vm/nr_hugepages"); + return -1; + } + + if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { + perror("Failed to read from /proc/sys/vm/nr_hugepages"); + goto close_fd; + } + + /* Start with the initial condition of 0 huge pages*/ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Request a large number of huge pages. The Kernel will allocate + as much as it can */ + if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { + perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + /* We should have been able to request at least 1/3 rd of the memory in + huge pages */ + compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); + + if (compaction_index > 3) { + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" + "as huge pages\n", compaction_index); + goto close_fd; + } + + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + + lseek(fd, 0, SEEK_SET); + + if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) + != strlen(initial_nr_hugepages)) { + perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + close(fd); + return 0; + + close_fd: + close(fd); + printf("Not OK. Compaction test failed."); + return -1; +} + + +int main(int argc, char **argv) +{ + struct rlimit lim; + struct map_list *list, *entry; + size_t page_size, i; + void *map = NULL; + unsigned long mem_free = 0; + unsigned long hugepage_size = 0; + long mem_fragmentable_MB = 0; + + if (prereq() != 0) { + printf("Either the sysctl compact_unevictable_allowed is not\n" + "set to 1 or couldn't read the proc file.\n" + "Skipping the test\n"); + return KSFT_SKIP; + } + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &lim)) { + perror("Failed to set rlimit:\n"); + return -1; + } + + page_size = getpagesize(); + + list = NULL; + + if (read_memory_info(&mem_free, &hugepage_size) != 0) { + printf("ERROR: Cannot read meminfo\n"); + return -1; + } + + mem_fragmentable_MB = mem_free * 0.8 / 1024; + + while (mem_fragmentable_MB > 0) { + map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); + if (map == MAP_FAILED) + break; + + entry = malloc(sizeof(struct map_list)); + if (!entry) { + munmap(map, MAP_SIZE); + break; + } + entry->map = map; + entry->next = list; + list = entry; + + /* Write something (in this case the address of the map) to + * ensure that KSM can't merge the mapped pages + */ + for (i = 0; i < MAP_SIZE; i += page_size) + *(unsigned long *)(map + i) = (unsigned long)map + i; + + mem_fragmentable_MB -= MAP_SIZE_MB; + } + + for (entry = list; entry != NULL; entry = entry->next) { + munmap(entry->map, MAP_SIZE); + if (!entry->next) + break; + entry = entry->next; + } + + if (check_compaction(mem_free, hugepage_size) == 0) + return 0; + + return -1; +} diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config new file mode 100644 index 000000000000..be087c4bc396 --- /dev/null +++ b/tools/testing/selftests/mm/config @@ -0,0 +1,8 @@ +CONFIG_SYSVIPC=y +CONFIG_USERFAULTFD=y +CONFIG_TEST_VMALLOC=m +CONFIG_DEVICE_PRIVATE=y +CONFIG_TEST_HMM=m +CONFIG_GUP_TEST=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c new file mode 100644 index 000000000000..16216d893d96 --- /dev/null +++ b/tools/testing/selftests/mm/cow.c @@ -0,0 +1,1764 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * COW (Copy On Write) tests. + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +#include "../../../../mm/gup_test.h" +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +static size_t pagesize; +static int pagemap_fd; +static size_t thpsize; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; +static int gup_fd; +static bool has_huge_zeropage; + +static void detect_thpsize(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", + O_RDONLY); + size_t size = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + size = strtoul(buf, NULL, 10); + if (size < pagesize) + size = 0; + if (size > 0) { + thpsize = size; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + } + } + + close(fd); +} + +static void detect_huge_zeropage(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", + O_RDONLY); + size_t enabled = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + enabled = strtoul(buf, NULL, 10); + if (enabled == 1) { + has_huge_zeropage = true; + ksft_print_msg("[INFO] huge zeropage is enabled\n"); + } + } + + close(fd); +} + +static void detect_hugetlbsizes(void) +{ + DIR *dir = opendir("/sys/kernel/mm/hugepages/"); + + if (!dir) + return; + + while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + hugetlbsizes[nr_hugetlbsizes] = kb * 1024; + nr_hugetlbsizes++; + ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", + kb); + } + closedir(dir); +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +struct comm_pipes { + int child_ready[2]; + int parent_ready[2]; +}; + +static int setup_comm_pipes(struct comm_pipes *comm_pipes) +{ + if (pipe(comm_pipes->child_ready) < 0) + return -errno; + if (pipe(comm_pipes->parent_ready) < 0) { + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + return -errno; + } + + return 0; +} + +static void close_comm_pipes(struct comm_pipes *comm_pipes) +{ + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + close(comm_pipes->parent_ready[0]); + close(comm_pipes->parent_ready[1]); +} + +static int child_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + char *old = malloc(size); + char buf; + + /* Backup the original content. */ + memcpy(old, mem, size); + + /* Wait until the parent modified the page. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values. */ + return memcmp(old, mem, size); +} + +static int child_vmsplice_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + char *old, *new; + int fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + /* Backup the original content. */ + memcpy(old, mem, size); + + if (pipe(fds) < 0) + return -errno; + + /* Trigger a read-only pin. */ + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred < 0) + return -errno; + if (transferred == 0) + return -EINVAL; + + /* Unmap it from our page tables. */ + if (munmap(mem, size) < 0) + return -errno; + + /* Wait until the parent modified it. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values via the pipe. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) + return -errno; + } + + return memcmp(old, new, transferred); +} + +typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); + +static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, + child_fn fn) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + exit(fn(mem, size, &comm_pipes)); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (do_mprotect) { + /* + * mprotect() optimizations might try avoiding + * write-faults by directly mapping pages writable. + */ + ret = mprotect(mem, size, PROT_READ); + ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_cow_in_parent(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_memcmp_fn); +} + +static void test_cow_in_parent_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_memcmp_fn); +} + +static void test_vmsplice_in_child(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); +} + +static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); +} + +static void do_test_vmsplice_in_parent(char *mem, size_t size, + bool before_fork) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + struct comm_pipes comm_pipes; + char *old, *new; + int ret, fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + memcpy(old, mem, size); + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free; + } + + if (pipe(fds) < 0) { + ksft_test_result_fail("pipe() failed\n"); + goto close_comm_pipes; + } + + if (before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + goto close_pipe; + } + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_pipe; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + /* Modify page content in the child. */ + memset(mem, 0xff, size); + exit(0); + } + + if (!before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + wait(&ret); + goto close_pipe; + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + if (munmap(mem, size) < 0) { + ksft_test_result_fail("munmap() failed\n"); + goto close_pipe; + } + write(comm_pipes.parent_ready[1], "0", 1); + + /* Wait until the child is done writing. */ + wait(&ret); + if (!WIFEXITED(ret)) { + ksft_test_result_fail("wait() failed\n"); + goto close_pipe; + } + + /* See if we still read the old values. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) { + ksft_test_result_fail("read() failed\n"); + goto close_pipe; + } + } + + ksft_test_result(!memcmp(old, new, transferred), + "No leak from child into parent\n"); +close_pipe: + close(fds[0]); + close(fds[1]); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free: + free(old); + free(new); +} + +static void test_vmsplice_before_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, true); +} + +static void test_vmsplice_after_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, false); +} + +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void do_test_iouring(char *mem, size_t size, bool use_fork) +{ + struct comm_pipes comm_pipes; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + ssize_t cur, total; + struct iovec iov; + char *buf, *tmp; + int ret, fd; + FILE *file; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + goto close_comm_pipes; + } + fd = fileno(file); + assert(fd); + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + goto close_file; + } + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + goto free_tmp; + } + + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN + * | FOLL_LONGTERM the range. + * + * Skip on errors, as we might just lack kernel support or might not + * have sufficient MEMLOCK permissions. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) { + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + goto queue_exit; + } + + if (use_fork) { + /* + * fork() and keep the child alive until we're done. Note that + * we expect the pinned page to not get shared with the child. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unregister_buffers; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + } else { + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto unregister_buffers; + } + } + + /* + * Modify the page and write page content as observed by the fixed + * buffer pin to the file so we can verify it. + */ + memset(mem, 0xff, size); + sqe = io_uring_get_sqe(&ring); + if (!sqe) { + ksft_test_result_fail("io_uring_get_sqe() failed\n"); + goto quit_child; + } + io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); + + ret = io_uring_submit(&ring); + if (ret < 0) { + ksft_test_result_fail("io_uring_submit() failed\n"); + goto quit_child; + } + + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + ksft_test_result_fail("io_uring_wait_cqe() failed\n"); + goto quit_child; + } + + if (cqe->res != size) { + ksft_test_result_fail("write_fixed failed\n"); + goto quit_child; + } + io_uring_cqe_seen(&ring, cqe); + + /* Read back the file content to the temporary buffer. */ + total = 0; + while (total < size) { + cur = pread(fd, tmp + total, size - total, total); + if (cur < 0) { + ksft_test_result_fail("pread() failed\n"); + goto quit_child; + } + total += cur; + } + + /* Finally, check if we read what we expected. */ + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/W pin is reliable\n"); + +quit_child: + if (use_fork) { + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + } +unregister_buffers: + io_uring_unregister_buffers(&ring); +queue_exit: + io_uring_queue_exit(&ring); +free_tmp: + free(tmp); +close_file: + fclose(file); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_iouring_ro(char *mem, size_t size) +{ + do_test_iouring(mem, size, false); +} + +static void test_iouring_fork(char *mem, size_t size) +{ + do_test_iouring(mem, size, true); +} + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +enum ro_pin_test { + RO_PIN_TEST, + RO_PIN_TEST_SHARED, + RO_PIN_TEST_PREVIOUSLY_SHARED, + RO_PIN_TEST_RO_EXCLUSIVE, +}; + +static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, + bool fast) +{ + struct pin_longterm_test args; + struct comm_pipes comm_pipes; + char *tmp, buf; + __u64 tmp_val; + int ret; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + return; + } + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + return; + } + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free_tmp; + } + + switch (test) { + case RO_PIN_TEST: + break; + case RO_PIN_TEST_SHARED: + case RO_PIN_TEST_PREVIOUSLY_SHARED: + /* + * Share the pages with our child. As the pages are not pinned, + * this should just work. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + /* Wait until our child is ready. */ + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { + /* + * Tell the child to quit now and wait until it quit. + * The pages should now be mapped R/O into our page + * tables, but they are no longer shared. + */ + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + } + break; + case RO_PIN_TEST_RO_EXCLUSIVE: + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Take a R/O pin. This should trigger unsharing. */ + args.addr = (__u64)(uintptr_t)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret) { + if (errno == EINVAL) + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + else + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + goto wait; + } + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* + * Read back the content via the pin to the temporary buffer and + * test if we observed the modification. + */ + tmp_val = (__u64)(uintptr_t)tmp; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); + if (ret) + ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); + else + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/O pin is reliable\n"); + + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); + if (ret) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); +wait: + switch (test) { + case RO_PIN_TEST_SHARED: + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + break; + default: + break; + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free_tmp: + free(tmp); +} + +static void test_ro_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); +} + +static void test_ro_fast_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); +} + +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); +} + +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); +} + +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); +} + +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); +} + +typedef void (*test_fn)(char *mem, size_t size); + +static void do_run_with_base_page(test_fn fn, bool swapout) +{ + char *mem; + int ret; + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); + /* Ignore if not around on a kernel. */ + if (ret && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto munmap; + } + + /* Populate a base page. */ + memset(mem, 0, pagesize); + + if (swapout) { + madvise(mem, pagesize, MADV_PAGEOUT); + if (!pagemap_is_swapped(pagemap_fd, mem)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + } + + fn(mem, pagesize); +munmap: + munmap(mem, pagesize); +} + +static void run_with_base_page(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with base page\n", desc); + do_run_with_base_page(fn, false); +} + +static void run_with_base_page_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); + do_run_with_base_page(fn, true); +} + +enum thp_run { + THP_RUN_PMD, + THP_RUN_PMD_SWAPOUT, + THP_RUN_PTE, + THP_RUN_PTE_SWAPOUT, + THP_RUN_SINGLE_PTE, + THP_RUN_SINGLE_PTE_SWAPOUT, + THP_RUN_PARTIAL_MREMAP, + THP_RUN_PARTIAL_SHARED, +}; + +static void do_run_with_thp(test_fn fn, enum thp_run thp_run) +{ + char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; + size_t size, mmap_size, mremap_size; + int ret; + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Try to populate a THP. Touch the first sub-page and test if we get + * another sub-page populated automatically. + */ + mem[0] = 0; + if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + memset(mem, 0, thpsize); + + size = thpsize; + switch (thp_run) { + case THP_RUN_PMD: + case THP_RUN_PMD_SWAPOUT: + break; + case THP_RUN_PTE: + case THP_RUN_PTE_SWAPOUT: + /* + * Trigger PTE-mapping the THP by temporarily mapping a single + * subpage R/O. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + break; + case THP_RUN_SINGLE_PTE: + case THP_RUN_SINGLE_PTE_SWAPOUT: + /* + * Discard all but a single subpage of that PTE-mapped THP. What + * remains is a single PTE mapping a single subpage. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); + if (ret) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto munmap; + } + size = pagesize; + break; + case THP_RUN_PARTIAL_MREMAP: + /* + * Remap half of the THP. We need some new memory location + * for that. + */ + mremap_size = thpsize / 2; + mremap_mem = mmap(NULL, mremap_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + tmp = mremap(mem + mremap_size, mremap_size, mremap_size, + MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); + if (tmp != mremap_mem) { + ksft_test_result_fail("mremap() failed\n"); + goto munmap; + } + size = mremap_size; + break; + case THP_RUN_PARTIAL_SHARED: + /* + * Share the first page of the THP with a child and quit the + * child. This will result in some parts of the THP never + * have been shared. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto munmap; + } + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto munmap; + } else if (!ret) { + exit(0); + } + wait(&ret); + /* Allow for sharing all pages again. */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + goto munmap; + } + break; + default: + assert(false); + } + + switch (thp_run) { + case THP_RUN_PMD_SWAPOUT: + case THP_RUN_PTE_SWAPOUT: + case THP_RUN_SINGLE_PTE_SWAPOUT: + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + break; + default: + break; + } + + fn(mem, size); +munmap: + munmap(mmap_mem, mmap_size); + if (mremap_mem != MAP_FAILED) + munmap(mremap_mem, mremap_size); +} + +static void run_with_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD); +} + +static void run_with_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); +} + +static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE); +} + +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); +} + +static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE); +} + +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); +} + +static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); +} + +static void run_with_partial_shared_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); +} + +static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + char *mem, *dummy; + + ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; + + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Populate an huge page. */ + memset(mem, 0, hugetlbsize); + + /* + * We need a total of two hugetlb pages to handle COW/unsharing + * properly, otherwise we might get zapped by a SIGBUS. + */ + dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (dummy == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto munmap; + } + munmap(dummy, hugetlbsize); + + fn(mem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +/* + * Test cases that are specific to anonymous pages: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_test_cases[] = { + /* + * Basic COW tests for fork() without any GUP. If we miss to break COW, + * either the child can observe modifications by the parent or the + * other way around. + */ + { + "Basic COW after fork()", + test_cow_in_parent, + }, + /* + * Basic test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "Basic COW after fork() with mprotect() optimization", + test_cow_in_parent_mprotect, + }, + /* + * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If + * we miss to break COW, the child observes modifications by the parent. + * This is CVE-2020-29374 reported by Jann Horn. + */ + { + "vmsplice() + unmap in child", + test_vmsplice_in_child + }, + /* + * vmsplice() test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "vmsplice() + unmap in child with mprotect() optimization", + test_vmsplice_in_child_mprotect + }, + /* + * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after + * fork(); modify in the child. If we miss to break COW, the parent + * observes modifications by the child. + */ + { + "vmsplice() before fork(), unmap in parent after fork()", + test_vmsplice_before_fork, + }, + /* + * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the + * child. If we miss to break COW, the parent observes modifications by + * the child. + */ + { + "vmsplice() + unmap in parent after fork()", + test_vmsplice_after_fork, + }, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + /* + * Take a R/W longterm pin and then map the page R/O into the page + * table to trigger a write fault on next access. When modifying the + * page, the page content must be visible via the pin. + */ + { + "R/O-mapping a page registered as iouring fixed buffer", + test_iouring_ro, + }, + /* + * Take a R/W longterm pin and then fork() a child. When modifying the + * page, the page content must be visible via the pin. We expect the + * pinned page to not get shared with the child. + */ + { + "fork() with an iouring fixed buffer", + test_iouring_fork, + }, + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + /* + * Take a R/O longterm pin on a R/O-mapped shared anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped shared page", + test_ro_pin_on_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped shared page", + test_ro_fast_pin_on_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that + * was previously shared. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped previously-shared page", + test_ro_pin_on_ro_previously_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped previously-shared page", + test_ro_fast_pin_on_ro_previously_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped exclusive page", + test_ro_pin_on_ro_exclusive, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped exclusive page", + test_ro_fast_pin_on_ro_exclusive, + }, +}; + +static void run_anon_test_case(struct test_case const *test_case) +{ + int i; + + run_with_base_page(test_case->fn, test_case->desc); + run_with_base_page_swap(test_case->fn, test_case->desc); + if (thpsize) { + run_with_thp(test_case->fn, test_case->desc); + run_with_thp_swap(test_case->fn, test_case->desc); + run_with_pte_mapped_thp(test_case->fn, test_case->desc); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); + run_with_single_pte_of_thp(test_case->fn, test_case->desc); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); + run_with_partial_mremap_thp(test_case->fn, test_case->desc); + run_with_partial_shared_thp(test_case->fn, test_case->desc); + } + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) + run_anon_test_case(&anon_test_cases[i]); +} + +static int tests_per_anon_test_case(void) +{ + int tests = 2 + nr_hugetlbsizes; + + if (thpsize) + tests += 8; + return tests; +} + +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. */ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (upper shared)", + test_anon_thp_collapse_upper_shared, + }, +}; + +static void run_anon_thp_test_cases(void) +{ + int i; + + if (!thpsize) + return; + + ksft_print_msg("[INFO] Anonymous THP tests\n"); + + for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { + struct test_case const *test_case = &anon_thp_test_cases[i]; + + ksft_print_msg("[RUN] %s\n", test_case->desc); + do_run_with_thp(test_case->fn, THP_RUN_PMD); + } +} + +static int tests_per_anon_thp_test_case(void) +{ + return thpsize ? 1 : 0; +} + +typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); + +static void test_cow(char *mem, const char *smem, size_t size) +{ + char *old = malloc(size); + + /* Backup the original content. */ + memcpy(old, smem, size); + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* See if we still read the old values via the other mapping. */ + ksft_test_result(!memcmp(smem, old, size), + "Other mapping not modified\n"); + free(old); +} + +static void test_ro_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, false); +} + +static void test_ro_fast_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, true); +} + +static void run_with_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + + ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Read from the page to populate the shared zeropage. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +} + +static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, *mmap_mem, *mmap_smem, tmp; + size_t mmap_size; + int ret; + + ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); + + if (!has_huge_zeropage) { + ksft_test_result_skip("Huge zeropage not enabled\n"); + return; + } + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + mmap_smem = mmap(NULL, mmap_size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_smem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + ret |= madvise(smem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Read from the memory to populate the huge shared zeropage. Read from + * the first sub-page and test if we get another sub-page populated + * automatically. + */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || + !pagemap_is_populated(pagemap_fd, smem + pagesize)) { + ksft_test_result_skip("Did not get THPs populated\n"); + goto munmap; + } + + fn(mem, smem, thpsize); +munmap: + munmap(mmap_mem, mmap_size); + if (mmap_smem != MAP_FAILED) + munmap(mmap_smem, mmap_size); +} + +static void run_with_memfd(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd\n", desc); + + fd = memfd_create("test", 0); + if (fd < 0) { + ksft_test_result_fail("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + close(fd); +} + +static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + FILE *file; + int fd; + + ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + return; + } + + fd = fileno(file); + if (fd < 0) { + ksft_test_result_skip("fileno() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + fclose(file); +} + +static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, + size_t hugetlbsize) +{ + int flags = MFD_HUGETLB; + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; + + fd = memfd_create("test", flags); + if (fd < 0) { + ksft_test_result_skip("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, hugetlbsize)) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, + 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); + if (mem != MAP_FAILED) + munmap(smem, hugetlbsize); +close: + close(fd); +} + +struct non_anon_test_case { + const char *desc; + non_anon_test_fn fn; +}; + +/* + * Test cases that target any pages in private mappings that are not anonymous: + * pages that may get shared via COW ndependent of fork(). This includes + * the shared zeropage(s), pagecache pages, ... + */ +static const struct non_anon_test_case non_anon_test_cases[] = { + /* + * Basic COW test without any GUP. If we miss to break COW, changes are + * visible via other private/shared mappings. + */ + { + "Basic COW", + test_cow, + }, + /* + * Take a R/O longterm pin. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O longterm GUP pin", + test_ro_pin, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O longterm GUP-fast pin", + test_ro_fast_pin, + }, +}; + +static void run_non_anon_test_case(struct non_anon_test_case const *test_case) +{ + int i; + + run_with_zeropage(test_case->fn, test_case->desc); + run_with_memfd(test_case->fn, test_case->desc); + run_with_tmpfile(test_case->fn, test_case->desc); + if (thpsize) + run_with_huge_zeropage(test_case->fn, test_case->desc); + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_memfd_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_non_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) + run_non_anon_test_case(&non_anon_test_cases[i]); +} + +static int tests_per_non_anon_test_case(void) +{ + int tests = 3 + nr_hugetlbsizes; + + if (thpsize) + tests += 1; + return tests; +} + +int main(int argc, char **argv) +{ + int err; + + pagesize = getpagesize(); + detect_thpsize(); + detect_hugetlbsizes(); + detect_huge_zeropage(); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + + run_anon_test_cases(); + run_anon_thp_test_cases(); + run_non_anon_test_cases(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c new file mode 100644 index 000000000000..e43879291dac --- /dev/null +++ b/tools/testing/selftests/mm/gup_test.c @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +#include "util.h" + +#define MB (1UL << 20) + +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ + +#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" + +static unsigned long cmd = GUP_FAST_BENCHMARK; +static int gup_fd, repeats = 1; +static unsigned long size = 128 * MB; +/* Serialize prints */ +static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; + +static char *cmd_to_str(unsigned long cmd) +{ + switch (cmd) { + case GUP_FAST_BENCHMARK: + return "GUP_FAST_BENCHMARK"; + case PIN_FAST_BENCHMARK: + return "PIN_FAST_BENCHMARK"; + case PIN_LONGTERM_BENCHMARK: + return "PIN_LONGTERM_BENCHMARK"; + case GUP_BASIC_TEST: + return "GUP_BASIC_TEST"; + case PIN_BASIC_TEST: + return "PIN_BASIC_TEST"; + case DUMP_USER_PAGES_TEST: + return "DUMP_USER_PAGES_TEST"; + } + return "Unknown command"; +} + +void *gup_thread(void *data) +{ + struct gup_test gup = *(struct gup_test *)data; + int i; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) + perror("ioctl"), exit(1); + + pthread_mutex_lock(&print_mutex); + printf("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + pthread_mutex_unlock(&print_mutex); + } + } else { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } + + pthread_mutex_lock(&print_mutex); + printf("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + printf("Truncated (size: %lld)\n", gup.size); + pthread_mutex_unlock(&print_mutex); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + struct gup_test gup = { 0 }; + int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; + int flags = MAP_PRIVATE, touch = 0; + char *file = "/dev/zero"; + pthread_t *tid; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { + switch (opt) { + case 'a': + cmd = PIN_FAST_BENCHMARK; + break; + case 'b': + cmd = PIN_BASIC_TEST; + break; + case 'L': + cmd = PIN_LONGTERM_BENCHMARK; + break; + case 'c': + cmd = DUMP_USER_PAGES_TEST; + /* + * Dump page 0 (index 1). May be overridden later, by + * user's non-option arguments. + * + * .which_pages is zero-based, so that zero can mean "do + * nothing". + */ + gup.which_pages[0] = 1; + break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; + case 'F': + /* strtol, so you can pass flags in hex form */ + gup.gup_flags = strtol(optarg, 0, 0); + break; + case 'j': + nthreads = atoi(optarg); + break; + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'U': + cmd = GUP_BASIC_TEST; + break; + case 'u': + cmd = GUP_FAST_BENCHMARK; + break; + case 'w': + write = 1; + break; + case 'W': + write = 0; + break; + case 'f': + file = optarg; + break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; + case 'H': + flags |= (MAP_HUGETLB | MAP_ANONYMOUS); + break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; + default: + return -1; + } + } + + if (optind < argc) { + int extra_arg_count = 0; + /* + * For example: + * + * ./gup_test -c 0 1 0x1001 + * + * ...to dump pages 0, 1, and 4097 + */ + + while ((optind < argc) && + (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { + /* + * Do the 1-based indexing here, so that the user can + * use normal 0-based indexing on the command line. + */ + long page_index = strtol(argv[optind], 0, 0) + 1; + + gup.which_pages[extra_arg_count] = page_index; + extra_arg_count++; + optind++; + } + } + + filed = open(file, O_RDWR|O_CREAT); + if (filed < 0) { + perror("open"); + exit(filed); + } + + gup.nr_pages_per_call = nr_pages; + if (write) + gup.gup_flags |= FOLL_WRITE; + + gup_fd = open(GUP_TEST_FILE, O_RDWR); + if (gup_fd == -1) { + switch (errno) { + case EACCES: + if (getuid()) + printf("Please run this test as root\n"); + break; + case ENOENT: + if (opendir("/sys/kernel/debug") == NULL) { + printf("mount debugfs at /sys/kernel/debug\n"); + break; + } + printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); + break; + default: + perror("failed to open " GUP_TEST_FILE); + break; + } + exit(KSFT_SKIP); + } + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); + if (p == MAP_FAILED) { + perror("mmap"); + exit(1); + } + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + } + + tid = malloc(sizeof(pthread_t) * nthreads); + assert(tid); + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&tid[i], NULL, gup_thread, &gup); + assert(ret == 0); + } + for (i = 0; i < nthreads; i++) { + ret = pthread_join(tid[i], NULL); + assert(ret == 0); + } + free(tid); + + return 0; +} diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c new file mode 100644 index 000000000000..4adaad1b822f --- /dev/null +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -0,0 +1,2054 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HMM stands for Heterogeneous Memory Management, it is a helper layer inside + * the linux kernel to help device drivers mirror a process address space in + * the device. This allows the device to use the same address space which + * makes communication and data exchange a lot easier. + * + * This framework's sole purpose is to exercise various code paths inside + * the kernel to make sure that HMM performs as expected and to flush out any + * bugs. + */ + +#include "../kselftest_harness.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * This is a private UAPI to the kernel test module so it isn't exported + * in the usual include/uapi/... directory. + */ +#include +#include + +struct hmm_buffer { + void *ptr; + void *mirror; + unsigned long size; + int fd; + uint64_t cpages; + uint64_t faults; +}; + +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, +}; + +#define TWOMEG (1 << 21) +#define HMM_BUFFER_SIZE (1024 << 12) +#define HMM_PATH_MAX 64 +#define NTIMES 10 + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ + +FIXTURE(hmm) +{ + int fd; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + +FIXTURE(hmm2) +{ + int fd0; + int fd1; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + +static int hmm_open(int unit) +{ + char pathname[HMM_PATH_MAX]; + int fd; + + snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); + fd = open(pathname, O_RDWR, 0); + if (fd < 0) + fprintf(stderr, "could not open hmm dmirror driver (%s)\n", + pathname); + return fd; +} + +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + +FIXTURE_SETUP(hmm) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd, 0); +} + +FIXTURE_SETUP(hmm2) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd0, 0); + self->fd1 = hmm_open(variant->device_number1); + ASSERT_GE(self->fd1, 0); +} + +FIXTURE_TEARDOWN(hmm) +{ + int ret = close(self->fd); + + ASSERT_EQ(ret, 0); + self->fd = -1; +} + +FIXTURE_TEARDOWN(hmm2) +{ + int ret = close(self->fd0); + + ASSERT_EQ(ret, 0); + self->fd0 = -1; + + ret = close(self->fd1); + ASSERT_EQ(ret, 0); + self->fd1 = -1; +} + +static int hmm_dmirror_cmd(int fd, + unsigned long request, + struct hmm_buffer *buffer, + unsigned long npages) +{ + struct hmm_dmirror_cmd cmd; + int ret; + + /* Simulate a device reading system memory. */ + cmd.addr = (__u64)buffer->ptr; + cmd.ptr = (__u64)buffer->mirror; + cmd.npages = npages; + + for (;;) { + ret = ioctl(fd, request, &cmd); + if (ret == 0) + break; + if (errno == EINTR) + continue; + return -errno; + } + buffer->cpages = cmd.cpages; + buffer->faults = cmd.faults; + + return 0; +} + +static void hmm_buffer_free(struct hmm_buffer *buffer) +{ + if (buffer == NULL) + return; + + if (buffer->ptr) + munmap(buffer->ptr, buffer->size); + free(buffer->mirror); + free(buffer); +} + +/* + * Create a temporary file that will be deleted on close. + */ +static int hmm_create_file(unsigned long size) +{ + char path[HMM_PATH_MAX]; + int fd; + + strcpy(path, "/tmp"); + fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); + if (fd >= 0) { + int r; + + do { + r = ftruncate(fd, size); + } while (r == -1 && errno == EINTR); + if (!r) + return fd; + close(fd); + } + return -1; +} + +/* + * Return a random unsigned number. + */ +static unsigned int hmm_random(void) +{ + static int fd = -1; + unsigned int r; + + if (fd < 0) { + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s:%d failed to open /dev/urandom\n", + __FILE__, __LINE__); + return ~0U; + } + } + read(fd, &r, sizeof(r)); + return r; +} + +static void hmm_nanosleep(unsigned int n) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = n; + nanosleep(&t, NULL); +} + +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + +/* + * Simple NULL test of device open/close. + */ +TEST_F(hmm, open_close) +{ +} + +/* + * Read private anonymous memory. + */ +TEST_F(hmm, anon_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int val; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* + * Initialize buffer in system memory but leave the first two pages + * zero (pte_none and pfn_zero). + */ + i = 2 * self->page_size / sizeof(*ptr); + for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Set buffer permission to read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Populate the CPU page table with a special zero page. */ + val = *(int *)(buffer->ptr + self->page_size); + ASSERT_EQ(val, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + ptr = buffer->mirror; + for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + for (; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Read private anonymous memory which has been protected with + * mprotect() PROT_NONE. + */ +TEST_F(hmm, anon_read_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize mirror buffer so we can verify it isn't written. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + /* Protect buffer from reading. */ + ret = mprotect(buffer->ptr, size, PROT_NONE); + ASSERT_EQ(ret, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, -EFAULT); + + /* Allow CPU to read the buffer so we can check it. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory. + */ +TEST_F(hmm, anon_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory which has been protected with + * mprotect() PROT_READ. + */ +TEST_F(hmm, anon_write_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading a zero page of memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + ASSERT_EQ(buffer->faults, 1); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Now allow writing and see that the zero page is replaced. */ + ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Check that a device writing an anonymous private mapping + * will copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Check that a device writing an anonymous shared mapping + * will not copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Write private anonymous huge page. + */ +TEST_F(hmm, anon_write_huge) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = 2 * TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + size = TWOMEG; + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). + */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +/* + * Write huge TLBFS page. + */ +TEST_F(hmm, anon_write_hugetlbfs) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + unsigned long i; + int *ptr; + int ret; + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + SKIP(return, "Huge page could not be allocated"); + } + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Read mmap'ed file memory. + */ +TEST_F(hmm, file_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Write initial contents of the file. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + len = pwrite(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + memset(buffer->mirror, 0, size); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write mmap'ed file memory. + */ +TEST_F(hmm, file_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check that the device also wrote the file. */ + len = pread(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory. + */ +TEST_F(hmm, migrate) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault some of it back + * to system memory, then try migrating the resulting mix of system and device + * private memory to the device. + */ +TEST_F(hmm, migrate_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault half the pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate memory to the device again. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous shared memory to device private memory. + */ +TEST_F(hmm, migrate_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, -ENOENT); + + hmm_buffer_free(buffer); +} + +/* + * Try to migrate various memory types to device private memory. + */ +TEST_F(hmm2, migrate_mixed) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + int ret; + int val; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Migrating a protected area should be an error. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, -EINVAL); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* We expect an error if the vma doesn't cover the range. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); + ASSERT_EQ(ret, -EINVAL); + + /* Page 2 will be a read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-5 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + ptr = (int *)(buffer->ptr + 5 * self->page_size); + *ptr = val; + + /* Now try to migrate pages 2-5 to device 1. */ + buffer->ptr = p + 2 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 4); + + /* Page 5 won't be migrated to device 0 because it's on device 1. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, -ENOENT); + buffer->ptr = p; + + buffer->ptr = p; + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. + */ +TEST_F(hmm, migrate_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); + } +} + +/* + * Read anonymous memory multiple times. + */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. + */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } + + hmm_buffer_free(buffer); +} + +/* + * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that + * should be mapped by a large page table entry. + */ +TEST_F(hmm, compound) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + int *ptr; + unsigned char *m; + int ret; + unsigned long i; + + /* Skip test if we can't allocate a hugetlbfs page. */ + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + return; + } + + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize the pages the device will snapshot in buffer->ptr. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + /* Make the region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | + HMM_DMIRROR_PROT_PMD); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate pages to device 1 and try to read from device 0. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Basic check of exclusive faulting. + */ +TEST_F(hmm, exclusive) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + /* Check atomic access revoked */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, exclusive_mprotect) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + hmm_buffer_free(buffer); +} + +/* + * Check copy-on-write works. + */ +TEST_F(hmm, exclusive_cow) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + fork(); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + hmm_buffer_free(buffer); +} + +static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, + int npages, int size, int flags) +{ + struct gup_test gup = { + .nr_pages_per_call = npages, + .addr = addr, + .gup_flags = FOLL_WRITE | flags, + .size = size, + }; + + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl on error\n"); + return errno; + } + + return 0; +} + +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr, + GUP_BASIC_TEST, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 1 * self->page_size, + GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 2 * self->page_size, + PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 3 * self->page_size, + PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + if (hmm_is_coherent_type(variant->device_number)) { + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); + } else { + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); + } + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); + /* + * Check again the content on the pages. Make sure there's no + * corrupted data. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + close(gup_fd); + hmm_buffer_free(buffer); +} + +/* + * Test copy-on-write in device pages. + * In case of writing to COW private page(s), a page fault will migrate pages + * back to system memory first. Then, these pages will be duplicated. In case + * of COW device coherent type, pages are duplicated directly from device + * memory. + */ +TEST_F(hmm, hmm_cow_in_device) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + pid_t pid; + int status; + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (!pid) { + /* Child process waitd for SIGTERM from the parent. */ + while (1) { + } + perror("Should not reach this\n"); + exit(0); + } + /* Parent process writes to COW pages(s) and gets a + * new copy in system. In case of device private pages, + * this write causes a migration to system mem first. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Terminate child and wait */ + EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(pid, waitpid(pid, &status, 0)); + EXPECT_NE(0, WIFSIGNALED(status)); + EXPECT_EQ(SIGTERM, WTERMSIG(status)); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); + + hmm_buffer_free(buffer); +} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c new file mode 100644 index 000000000000..955ef87f382c --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mmap: + * + * Example of using huge page memory in a user application using the mmap + * system call. Before running this application, make sure that the + * administrator has mounted the hugetlbfs filesystem (on some directory + * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this + * example, the app is requesting memory of size 256MB that is backed by + * huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_SHARED | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_SHARED) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +int main(void) +{ + void *addr; + int fd, ret; + + fd = memfd_create("hugepage-mmap", MFD_HUGETLB); + if (fd < 0) { + perror("memfd_create() failed"); + exit(1); + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + close(fd); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + ret = read_bytes(addr); + + munmap(addr, LENGTH); + close(fd); + + return ret; +} diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c new file mode 100644 index 000000000000..e53b5eaa8fce --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mremap: + * + * Example of remapping huge page memory in a user application using the + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. + * + * To make sure the test triggers pmd sharing and goes through the 'unshare' + * path in the mremap code use 1GB (1024) or more. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include /* Definition of O_* constants */ +#include /* Definition of SYS_* constants */ +#include +#include +#include + +#define DEFAULT_LENGTH_MB 10UL +#define MB_TO_BYTES(x) (x * 1024 * 1024) + +#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) +#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t len) +{ + unsigned long i; + + for (i = 0; i < len; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t len) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < len; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +static void register_region_with_uffd(char *addr, size_t len) +{ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + /* Create a private anonymous mapping. The memory will be + * demand-zero paged--that is, not yet allocated. When we + * actually touch the memory, it will be allocated via + * the userfaultfd. + */ + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Address returned by mmap() = %p\n", addr); + + /* Register the memory range of the mapping we just created for + * handling by the userfaultfd object. In mode, we request to track + * missing pages (i.e., pages that have not yet been faulted in). + */ + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + size_t length = 0; + int ret = 0, fd; + + if (argc >= 2 && !strcmp(argv[1], "-h")) { + printf("Usage: %s [length_in_MB]\n", argv[0]); + exit(1); + } + + /* Read memory length as the first arg if valid, otherwise fallback to + * the default length. + */ + if (argc >= 2) + length = (size_t)atoi(argv[1]); + else + length = DEFAULT_LENGTH_MB; + + length = MB_TO_BYTES(length); + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ + unsigned long suggested_addr = 0x7eaa40000000; + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map haddr: Returned address is %p\n", haddr); + if (haddr == MAP_FAILED) { + perror("mmap1"); + exit(1); + } + + /* mmap again to a dummy address to hopefully trigger pmd sharing. */ + suggested_addr = 0x7daa40000000; + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map daddr: Returned address is %p\n", daddr); + if (daddr == MAP_FAILED) { + perror("mmap3"); + exit(1); + } + + suggested_addr = 0x7faa40000000; + void *vaddr = + mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); + printf("Map vaddr: Returned address is %p\n", vaddr); + if (vaddr == MAP_FAILED) { + perror("mmap2"); + exit(1); + } + + register_region_with_uffd(haddr, length); + + void *addr = mremap(haddr, length, length, + MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); + if (addr == MAP_FAILED) { + perror("mremap"); + exit(1); + } + + printf("Mremap: Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + munmap(addr, length); + + addr = mremap(addr, length, length, 0); + if (addr != MAP_FAILED) { + printf("mremap: Expected failure, but call succeeded\n"); + exit(1); + } + + close(fd); + + return ret; +} diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c new file mode 100644 index 000000000000..e2527f32005b --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-shm: + * + * Example of using huge page memory in a user application using Sys V shared + * memory system calls. In this example the app is requesting 256MB of + * memory that is backed by huge pages. The application uses the flag + * SHM_HUGETLB in the shmget system call to inform the kernel that it is + * requesting huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + * + * Note: The default shared memory limit is quite low on many kernels, + * you may need to increase it via: + * + * echo 268435456 > /proc/sys/kernel/shmmax + * + * This will increase the maximum size per shared memory segment to 256MB. + * The other limit that you will hit eventually is shmall which is the + * total amount of shared memory in pages. To set it to 16GB on a system + * with a 4kB pagesize do: + * + * echo 4194304 > /proc/sys/kernel/shmall + */ + +#include +#include +#include +#include +#include +#include + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#define LENGTH (256UL*1024*1024) + +#define dprintf(x) printf(x) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +int main(void) +{ + int shmid; + unsigned long i; + char *shmaddr; + + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + perror("shmget"); + exit(1); + } + printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + if (shmaddr == (char *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", shmaddr); + + dprintf("Starting the writes:\n"); + for (i = 0; i < LENGTH; i++) { + shmaddr[i] = (char)(i); + if (!(i % (1024 * 1024))) + dprintf("."); + } + dprintf("\n"); + + dprintf("Starting the Check..."); + for (i = 0; i < LENGTH; i++) + if (shmaddr[i] != (char)i) { + printf("\nIndex %lu mismatched\n", i); + exit(3); + } + dprintf("Done.\n"); + + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + + return 0; +} diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c new file mode 100644 index 000000000000..a634f47d1e56 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-madvise: + * + * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE + * on hugetlb mappings. + * + * Before running this test, make sure the administrator has pre-allocated + * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, + * the test takes an argument that is the path to a file in a hugetlbfs + * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some + * directory. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#define __USE_GNU +#include + +#define MIN_FREE_PAGES 20 +#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ + +#define validate_free_pages(exp_free) \ + do { \ + int fhp = get_free_hugepages(); \ + if (fhp != (exp_free)) { \ + printf("Unexpected number of free huge " \ + "pages line %d\n", __LINE__); \ + exit(1); \ + } \ + } while (0) + +unsigned long huge_page_size; +unsigned long base_page_size; + +/* + * default_huge_page_size copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +unsigned long get_free_hugepages(void) +{ + unsigned long fhp = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return fhp; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) + break; + } + + free(line); + fclose(f); + return fhp; +} + +void write_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long i; + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(addr + (i * huge_page_size))) = i; +} + +void read_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long dummy = 0; + unsigned long i; + + for (i = 0; i < nr_pages; i++) + dummy += *((unsigned long *)(addr + (i * huge_page_size))); +} + +int main(int argc, char **argv) +{ + unsigned long free_hugepages; + void *addr, *addr2; + int fd; + int ret; + + huge_page_size = default_huge_page_size(); + if (!huge_page_size) { + printf("Unable to determine huge page size, exiting!\n"); + exit(1); + } + base_page_size = sysconf(_SC_PAGE_SIZE); + if (!huge_page_size) { + printf("Unable to determine base page size, exiting!\n"); + exit(1); + } + + free_hugepages = get_free_hugepages(); + if (free_hugepages < MIN_FREE_PAGES) { + printf("Not enough free huge pages to test, exiting!\n"); + exit(1); + } + + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) { + perror("memfd_create() failed"); + exit(1); + } + + /* + * Test validity of MADV_DONTNEED addr and length arguments. mmap + * size is NR_HUGE_PAGES + 2. One page at the beginning and end of + * the mapping will be unmapped so we KNOW there is nothing mapped + * there. + */ + addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + if (munmap(addr, huge_page_size) || + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, + huge_page_size)) { + perror("munmap"); + exit(1); + } + addr = addr + huge_page_size; + + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr before mapping should fail */ + ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid addr line %d\n", + __LINE__); + exit(1); + } + + /* addr + length after mapping should fail */ + ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid length line %d\n", + __LINE__); + exit(1); + } + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test alignment of MADV_DONTNEED addr and length arguments + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned down to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all but last page in mapping */ + validate_free_pages(free_hugepages - 1); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + validate_free_pages(free_hugepages); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise should free private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh new file mode 100644 index 000000000000..bf2d2a684edf --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit $ksft_skip +fi + +usage_file=usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + usage_file=current +fi + + +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup2 none $CGROUP_ROOT + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 + fi +fi +MNT='/mnt/huge/' + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function cleanup() { + echo cleanup + set +e + rm -rf "$MNT"/* 2>/dev/null + umount "$MNT" 2>/dev/null + rmdir "$MNT" 2>/dev/null + rmdir "$CGROUP_ROOT"/a/b 2>/dev/null + rmdir "$CGROUP_ROOT"/a 2>/dev/null + rmdir "$CGROUP_ROOT"/test1 2>/dev/null + echo 0 >/proc/sys/vm/nr_hugepages + set -e +} + +function assert_state() { + local expected_a="$1" + local expected_a_hugetlb="$2" + local expected_b="" + local expected_b_hugetlb="" + + if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then + expected_b="$3" + expected_b_hugetlb="$4" + fi + local tolerance=$((5 * 1024 * 1024)) + + local actual_a + actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" + if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || + [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then + echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB + echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_a_hugetlb + actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || + [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then + echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB + echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then + return + fi + + local actual_b + actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" + if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || + [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then + echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB + echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_b_hugetlb + actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || + [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then + echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB + echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi +} + +function setup() { + echo 100 >/proc/sys/vm/nr_hugepages + mkdir "$CGROUP_ROOT"/a + sleep 1 + if [[ $cgroup2 ]]; then + echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control + else + echo 0 >$CGROUP_ROOT/a/cpuset.mems + echo 0 >$CGROUP_ROOT/a/cpuset.cpus + fi + + mkdir "$CGROUP_ROOT"/a/b + + if [[ ! $cgroup2 ]]; then + echo 0 >$CGROUP_ROOT/a/b/cpuset.mems + echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus + fi + + mkdir -p "$MNT" + mount -t hugetlbfs none "$MNT" +} + +write_hugetlbfs() { + local cgroup="$1" + local path="$2" + local size="$3" + + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs + else + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus + echo $$ >"$CGROUP_ROOT/$cgroup/tasks" + fi + ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/cgroup.procs + else + echo $$ >"$CGROUP_ROOT/tasks" + fi + echo +} + +set -e + +size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. + +cleanup + +echo +echo +echo Test charge, rmdir, uncharge +setup +echo mkdir +mkdir $CGROUP_ROOT/test1 + +echo write +write_hugetlbfs test1 "$MNT"/test $size + +echo rmdir +rmdir $CGROUP_ROOT/test1 +mkdir $CGROUP_ROOT/test1 + +echo uncharge +rm -rf /mnt/huge/* + +cleanup + +echo done +echo +echo +if [[ ! $cgroup2 ]]; then + echo "Test parent and child hugetlb usage" + setup + + echo write + write_hugetlbfs a "$MNT"/test $size + + echo Assert memory charged correctly for parent use. + assert_state 0 $size 0 0 + + write_hugetlbfs a/b "$MNT"/test2 $size + + echo Assert memory charged correctly for child use. + assert_state 0 $(($size * 2)) 0 $size + + rmdir "$CGROUP_ROOT"/a/b + sleep 5 + echo Assert memory reparent correctly. + assert_state 0 $(($size * 2)) + + rm -rf "$MNT"/* + umount "$MNT" + echo Assert memory uncharged correctly. + assert_state 0 0 + + cleanup +fi + +echo +echo +echo "Test child only hugetlb usage" +echo setup +setup + +echo write +write_hugetlbfs a/b "$MNT"/test2 $size + +echo Assert memory charged correctly for child only use. +assert_state 0 $(($size)) 0 $size + +rmdir "$CGROUP_ROOT"/a/b +echo Assert memory reparent correctly. +assert_state 0 $size + +rm -rf "$MNT"/* +umount "$MNT" +echo Assert memory uncharged correctly. +assert_state 0 0 + +cleanup + +echo ALL PASS + +umount $CGROUP_ROOT +rm -rf $CGROUP_ROOT diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c new file mode 100644 index 000000000000..64126c8cd561 --- /dev/null +++ b/tools/testing/selftests/mm/khugepaged.c @@ -0,0 +1,1558 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "linux/magic.h" + +#include "vm_util.h" + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool use_zero_page; + struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static const unsigned long _read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static const unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); +} + +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(int nr) +{ + void *p; + + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (ops->check_huge(p, nr_hpages)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); + + return timeout == -1; +} + +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + +static void alloc_at_fault(void) +{ + struct settings settings = *current_settings(); + char *p; + + settings.thp_enabled = THP_ALWAYS; + push_settings(&settings); + + p = alloc_mapping(1); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge_anon(p, 1, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + + pop_settings(); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (check_huge_anon(p, 0, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = *current_settings(); + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + push_settings(&settings); + + p = ops->setup_area(1); + + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + pop_settings(); +} + +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); + validate_memory(p, 0, hpage_pmd_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); + validate_memory(p, 0, page_size); +skip: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int i; + + p = ops->setup_area(1); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = ops->setup_area(1); + + printf("Allocate small page..."); + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); + + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + ops->fault(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); + write_num("khugepaged/max_ptes_shared", + current_settings()->khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. + */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\t\t: :\n"); + fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + +int main(int argc, const char **argv) +{ + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. + */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); + + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_pmd_pagesize(); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + push_settings(&default_settings); + + alloc_at_fault(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + + restore_settings(0); +} diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c new file mode 100644 index 000000000000..d8b5b4930412 --- /dev/null +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KSM functional tests + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#define KiB 1024u +#define MiB (1024 * KiB) + +static int ksm_fd; +static int ksm_full_scans_fd; +static int pagemap_fd; +static size_t pagesize; + +static bool range_maps_duplicates(char *addr, unsigned long size) +{ + unsigned long offs_a, offs_b, pfn_a, pfn_b; + + /* + * There is no easy way to check if there are KSM pages mapped into + * this range. We only check that the range does not map the same PFN + * twice by comparing each pair of mapped pages. + */ + for (offs_a = 0; offs_a < size; offs_a += pagesize) { + pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); + /* Page not present or PFN not exposed by the kernel. */ + if (pfn_a == -1ul || !pfn_a) + continue; + + for (offs_b = offs_a + pagesize; offs_b < size; + offs_b += pagesize) { + pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); + if (pfn_b == -1ul || !pfn_b) + continue; + if (pfn_a == pfn_b) + return true; + } + } + return false; +} + +static long ksm_get_full_scans(void) +{ + char buf[10]; + ssize_t ret; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static int ksm_merge(void) +{ + long start_scans, end_scans; + + /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) + return start_scans; + if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +static char *mmap_and_merge_range(char val, unsigned long size) +{ + char *map; + + map = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + + /* Don't use THP. Ignore if THP are not around on a kernel. */ + if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto unmap; + } + + /* Make sure each page contains the same values to merge them. */ + memset(map, val, size); + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_test_result_fail("MADV_MERGEABLE failed\n"); + goto unmap; + } + + /* Run KSM to trigger merging and wait. */ + if (ksm_merge()) { + ksft_test_result_fail("Running KSM failed\n"); + goto unmap; + } + return map; +unmap: + munmap(map, size); + return MAP_FAILED; +} + +static void test_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_discarded(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* Discard half of all mapped pages so we have pte_none() entries. */ + if (madvise(map, size / 2, MADV_DONTNEED)) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto unmap; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +#ifdef __NR_userfaultfd +static void test_unmerge_uffd_wp(void) +{ + struct uffdio_writeprotect uffd_writeprotect; + struct uffdio_register uffdio_register; + const unsigned int size = 2 * MiB; + struct uffdio_api uffdio_api; + char *map; + int uffd; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* See if UFFD is around. */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto unmap; + } + + /* See if UFFD-WP is around. */ + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); + goto close_uffd; + } + + /* Register UFFD-WP, no need for an actual handler. */ + uffdio_register.range.start = (unsigned long) map; + uffdio_register.range.len = size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { + ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); + goto close_uffd; + } + + /* Write-protect the range using UFFD-WP. */ + uffd_writeprotect.range.start = (unsigned long) map; + uffd_writeprotect.range.len = size; + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); + goto close_uffd; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto close_uffd; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +close_uffd: + close(uffd); +unmap: + munmap(map, size); +} +#endif + +int main(int argc, char **argv) +{ + unsigned int tests = 2; + int err; + +#ifdef __NR_userfaultfd + tests++; +#endif + + ksft_print_header(); + ksft_set_plan(tests); + + pagesize = getpagesize(); + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + + test_unmerge(); + test_unmerge_discarded(); +#ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +#endif + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c new file mode 100644 index 000000000000..f9eb4d67e0dd --- /dev/null +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include +#include "util.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, + KSM_COW_TIME +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); + + printf("Supported :\n" + " -M (page merging)\n" + " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); + + printf(" -a: specify the access protections of pages.\n" + " must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + int first_node; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_num_configured_nodes() <= 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + +static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; + + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; + case 'P': + test_name = KSM_MERGE_TIME; + break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; + case 'C': + test_name = KSM_COW_TIME; + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c new file mode 100644 index 000000000000..262eae6b58f2 --- /dev/null +++ b/tools/testing/selftests/mm/madv_populate.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + * + * Copyright 2021, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif /* MADV_POPULATE_READ */ +#ifndef MADV_POPULATE_WRITE +#define MADV_POPULATE_WRITE 23 +#endif /* MADV_POPULATE_WRITE */ + +/* + * For now, we're using 2 MiB of private anonymous memory for all tests. + */ +#define SIZE (2 * 1024 * 1024) + +static size_t pagesize; + +static void sense_support(void) +{ + char *addr; + int ret; + + addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!addr) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_READ); + if (ret) + ksft_exit_skip("MADV_POPULATE_READ is not available\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); + if (ret) + ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); + + munmap(addr, pagesize); +} + +static void test_prot_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_WRITE with PROT_READ\n"); + + munmap(addr, SIZE); +} + +static void test_prot_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_READ with PROT_WRITE\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); + + munmap(addr, SIZE); +} + +static void test_holes(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ret = munmap(addr + pagesize, pagesize); + if (ret) + ksft_exit_fail_msg("munmap failed\n"); + + /* Hole in the middle */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes in the middle\n"); + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes in the middle\n"); + + /* Hole at end */ + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the end\n"); + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the end\n"); + + /* Hole at beginning */ + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the beginning\n"); + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the beginning\n"); + + munmap(addr, SIZE); +} + +static bool range_is_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_populate_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static void test_populate_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static bool range_is_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_softdirty(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear any softdirty bits. */ + clear_softdirty(); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating READ should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating WRITE should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_softdirty(addr, SIZE), + "range is softdirty\n"); + + munmap(addr, SIZE); +} + +int main(int argc, char **argv) +{ + int err; + + pagesize = getpagesize(); + + ksft_print_header(); + ksft_set_plan(21); + + sense_support(); + test_prot_read(); + test_prot_write(); + test_holes(); + test_populate_read(); + test_populate_write(); + test_softdirty(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c new file mode 100644 index 000000000000..eed44322d1a6 --- /dev/null +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ + +#include +#include +#include +#include +#include + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +static unsigned long find_base_addr(unsigned long size) +{ + void *addr; + unsigned long flags; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; + addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + if (addr == MAP_FAILED) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + + if (munmap(addr, size) != 0) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + return (unsigned long)addr; +} + +int main(void) +{ + unsigned long base_addr; + unsigned long flags, addr, size, page_size; + char *p; + + page_size = sysconf(_SC_PAGE_SIZE); + + //let's find a base addr that is free before we start the tests + size = 5 * page_size; + base_addr = find_base_addr(size); + if (!base_addr) { + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + // Check we can map all the areas we need below + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + errno = 0; + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + errno = 0; + addr = base_addr + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: first mmap() failed unexpectedly\n"); + return 1; + } + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:1: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Second mapping contained within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:2: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:3: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:4: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:5: mmap() failed when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + errno = 0; + addr = base_addr + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:6: mmap() failed when it shouldn't have\n"); + return 1; + } + + addr = base_addr; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c new file mode 100644 index 000000000000..312889edb84a --- /dev/null +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_MASK +#define MAP_HUGE_MASK 0x3f +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t length) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < length; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + int ret; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + printf("%u kB hugepages\n", 1 << (shift - 10)); + else + printf("Default size hugepages\n"); + printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, length)) { + perror("munmap"); + exit(1); + } + + return ret; +} diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c new file mode 100644 index 000000000000..6b8aeaa0bf7a --- /dev/null +++ b/tools/testing/selftests/mm/map_populate.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Dmitry Safonov, Arista Networks + * + * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MMAP_SZ +#define MMAP_SZ 4096 +#endif + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + exit(1); \ + } \ + } while (0) + +static int parent_f(int sock, unsigned long *smap, int child) +{ + int status, ret; + + ret = read(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + *smap = 0x22222BAD; + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = write(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + waitpid(child, &status, 0); + BUG_ON(!WIFEXITED(status), "child in unexpected state"); + + return WEXITSTATUS(status); +} + +static int child_f(int sock, unsigned long *smap, int fd) +{ + int ret, buf = 0; + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_POPULATE, fd, 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); + + ret = write(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + ret = read(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); + BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + + return 0; +} + +int main(int argc, char **argv) +{ + int sock[2], child, ret; + FILE *ftmp; + unsigned long *smap; + + ftmp = tmpfile(); + BUG_ON(ftmp == 0, "tmpfile()"); + + ret = ftruncate(fileno(ftmp), MMAP_SZ); + BUG_ON(ret, "ftruncate()"); + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(ftmp), 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + *smap = 0xdeadbabe; + /* Probably unnecessary, but let it be. */ + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); + BUG_ON(ret, "socketpair()"); + + child = fork(); + BUG_ON(child == -1, "fork()"); + + if (child) { + ret = close(sock[0]); + BUG_ON(ret, "close()"); + + return parent_f(sock[1], smap, child); + } + + ret = close(sock[1]); + BUG_ON(ret, "close()"); + + return child_f(sock[0], smap, fileno(ftmp)); +} diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c new file mode 100644 index 000000000000..957b9e18c729 --- /dev/null +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_finished(); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c new file mode 100644 index 000000000000..1cec8425e3ca --- /dev/null +++ b/tools/testing/selftests/mm/migration.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The main purpose of the tests here is to exercise the migration entry code + * paths in the kernel. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define TWOMEG (2<<20) +#define RUNTIME (60) + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(migration) +{ + pthread_t *threads; + pid_t *pids; + int nthreads; + int n1; + int n2; +}; + +FIXTURE_SETUP(migration) +{ + int n; + + ASSERT_EQ(numa_available(), 0); + self->nthreads = numa_num_task_cpus() - 1; + self->n1 = -1; + self->n2 = -1; + + for (n = 0; n < numa_max_possible_node(); n++) + if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { + if (self->n1 == -1) { + self->n1 = n; + } else { + self->n2 = n; + break; + } + } + + self->threads = malloc(self->nthreads * sizeof(*self->threads)); + ASSERT_NE(self->threads, NULL); + self->pids = malloc(self->nthreads * sizeof(*self->pids)); + ASSERT_NE(self->pids, NULL); +}; + +FIXTURE_TEARDOWN(migration) +{ + free(self->threads); + free(self->pids); +} + +int migrate(uint64_t *ptr, int n1, int n2) +{ + int ret, tmp; + int status = 0; + struct timespec ts1, ts2; + + if (clock_gettime(CLOCK_MONOTONIC, &ts1)) + return -1; + + while (1) { + if (clock_gettime(CLOCK_MONOTONIC, &ts2)) + return -1; + + if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) + return 0; + + ret = move_pages(0, 1, (void **) &ptr, &n2, &status, + MPOL_MF_MOVE_ALL); + if (ret) { + if (ret > 0) + printf("Didn't migrate %d pages\n", ret); + else + perror("Couldn't migrate pages"); + return -2; + } + + tmp = n2; + n2 = n1; + n1 = tmp; + } + + return 0; +} + +void *access_mem(void *ptr) +{ + uint64_t y = 0; + volatile uint64_t *x = ptr; + + while (1) { + pthread_testcancel(); + y += *x; + } + + return NULL; +} + +/* + * Basic migration entry testing. One thread will move pages back and forth + * between nodes whilst other threads try and access them triggering the + * migration entry wait paths in the kernel. + */ +TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * Same as the previous test but with shared memory. + */ +TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) + access_mem(ptr); + else + self->pids[i] = pid; + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * Tests the pmd migration entry paths. + */ +TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c new file mode 100644 index 000000000000..782ea94dee2f --- /dev/null +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * It tests the mlock/mlock2() when they are invoked + * on randomly memory region. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +#define CHUNK_UNIT (128 * 1024) +#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) +#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT +#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) + +#define TEST_LOOP 100 +#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) + +int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error\n"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error\n"); + return -2; + } + + return 0; +} + +int get_proc_locked_vm_size(void) +{ + FILE *f; + int ret = -1; + char line[1024] = {0}; + unsigned long lock_size = 0; + + f = fopen("/proc/self/status", "r"); + if (!f) { + perror("fopen"); + return -1; + } + + while (fgets(line, 1024, f)) { + if (strstr(line, "VmLck")) { + ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); + if (ret <= 0) { + printf("sscanf() on VmLck error: %s: %d\n", + line, ret); + fclose(f); + return -1; + } + fclose(f); + return (int)(lock_size << 10); + } + } + + perror("cannot parse VmLck in /proc/self/status\n"); + fclose(f); + return -1; +} + +/* + * Get the MMUPageSize of the memory region including input + * address from proc file. + * + * return value: on error case, 0 will be returned. + * Otherwise the page size(in bytes) is returned. + */ +int get_proc_page_size(unsigned long addr) +{ + FILE *smaps; + char *line; + unsigned long mmupage_size = 0; + size_t size; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + return 0; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, "MMUPageSize")) { + free(line); + line = NULL; + size = 0; + continue; + } + + /* found the MMUPageSize of this section */ + if (sscanf(line, "MMUPageSize: %8lu kB", + &mmupage_size) < 1) { + printf("Unable to parse smaps entry for Size:%s\n", + line); + break; + } + + } + free(line); + if (smaps) + fclose(smaps); + return mmupage_size << 10; +} + +/* + * Test mlock/mlock2() on provided memory chunk. + * It expects the mlock/mlock2() to be successful (within rlimit) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will choose start/len randomly to perform mlock/mlock2 + * [start, start + len] memory range. The range is within range + * of the allocated chunk. + * + * The memory region size alloc_size is within the rlimit. + * So we always expect a success of mlock/mlock2. + * + * VmLck is assumed to be 0 before this test. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_within_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0; + struct rlimit cur; + int page_size = 0; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur < alloc_size) { + printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + /* + * - choose mlock/mlock2 randomly + * - choose lock_size randomly but lock_size < alloc_size + * - choose start_offset randomly but p+start_offset+lock_size + * < p+alloc_size + */ + int is_mlock = !!(rand() % 2); + int lock_size = rand() % alloc_size; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + + if (ret) { + printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return ret; + } + } + + /* + * Check VmLck left by the tests. + */ + locked_vm_size = get_proc_locked_vm_size(); + page_size = get_proc_page_size((unsigned long)p); + if (page_size == 0) { + printf("cannot get proc MMUPageSize\n"); + return -1; + } + + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { + printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", + locked_vm_size, alloc_size); + return -1; + } + + return 0; +} + + +/* + * We expect the mlock/mlock2() to be fail (outof limitation) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will randomly choose start/len and perform mlock/mlock2 + * on [start, start+len] range. + * + * The memory region size alloc_size is above the rlimit. + * And the len to be locked is higher than rlimit. + * So we always expect a failure of mlock/mlock2. + * No locked page number should be increased as a side effect. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_outof_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0, old_locked_vm_size = 0; + struct rlimit cur; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur >= alloc_size) { + printf("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + old_locked_vm_size = get_proc_locked_vm_size(); + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + int is_mlock = !!(rand() % 2); + int lock_size = (rand() % (alloc_size - cur.rlim_cur)) + + cur.rlim_cur; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + if (ret == 0) { + printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return -1; + } + } + + locked_vm_size = get_proc_locked_vm_size(); + if (locked_vm_size != old_locked_vm_size) { + printf("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + char *p = NULL; + int ret = 0; + + if (set_cap_limits(MLOCK_RLIMIT_SIZE)) + return -1; + + p = malloc(MLOCK_WITHIN_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_WITHIN_LIMIT_SIZE); + free(p); + + + p = malloc(MLOCK_OUTOF_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_OUTOF_LIMIT_SIZE); + free(p); + + return 0; +} diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c new file mode 100644 index 000000000000..11b2301f3aa3 --- /dev/null +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +#include "../kselftest.h" + +struct vm_boundaries { + unsigned long start; + unsigned long end; +}; + +static int get_vm_area(unsigned long addr, struct vm_boundaries *area) +{ + FILE *file; + int ret = 1; + char line[1024] = {0}; + char *end_addr; + char *stop; + unsigned long start; + unsigned long end; + + if (!area) + return ret; + + file = fopen("/proc/self/maps", "r"); + if (!file) { + perror("fopen"); + return ret; + } + + memset(area, 0, sizeof(struct vm_boundaries)); + + while(fgets(line, 1024, file)) { + end_addr = strchr(line, '-'); + if (!end_addr) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + *end_addr = '\0'; + end_addr++; + stop = strchr(end_addr, ' '); + if (!stop) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + stop = '\0'; + + sscanf(line, "%lx", &start); + sscanf(end_addr, "%lx", &end); + + if (start <= addr && end > addr) { + area->start = start; + area->end = end; + ret = 0; + goto out; + } + } +out: + fclose(file); + return ret; +} + +#define VMFLAGS "VmFlags:" + +static bool is_vmflag_set(unsigned long addr, const char *vmflag) +{ + char *line = NULL; + char *flags; + size_t size = 0; + bool ret = false; + FILE *smaps; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, VMFLAGS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + flags = line + strlen(VMFLAGS); + ret = (strstr(flags, vmflag) != NULL); + goto out; + } + +out: + free(line); + fclose(smaps); + return ret; +} + +#define SIZE "Size:" +#define RSS "Rss:" +#define LOCKED "lo" + +static unsigned long get_value_for_name(unsigned long addr, const char *name) +{ + char *line = NULL; + size_t size = 0; + char *value_ptr; + FILE *smaps = NULL; + unsigned long value = -1UL; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, name)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value_ptr = line + strlen(name); + if (sscanf(value_ptr, "%lu kB", &value) < 1) { + printf("Unable to parse smaps entry for Size\n"); + goto out; + } + break; + } + +out: + if (smaps) + fclose(smaps); + free(line); + return value; +} + +static bool is_vma_lock_on_fault(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + /* only one page is faulted in */ + return (vma_rss < vma_size); +} + +#define PRESENT_BIT 0x8000000000000000ULL +#define PFN_MASK 0x007FFFFFFFFFFFFFULL +#define UNEVICTABLE_BIT (1UL << 18) + +static int lock_check(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + return (vma_rss == vma_size); +} + +static int unlock_lock_check(char *map) +{ + if (is_vmflag_set((unsigned long)map, LOCKED)) { + printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + return 1; + } + + return 0; +} + +static int test_mlock_lock() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, 0)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(0)"); + goto unmap; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + perror("munlock()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int onfault_check(char *map) +{ + *map = 'a'; + if (!is_vma_lock_on_fault((unsigned long)map)) { + printf("VMA is not marked for lock on fault\n"); + return 1; + } + + return 0; +} + +static int unlock_onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + + if (is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA is still lock on fault after unlock\n"); + return 1; + } + + return 0; +} + +static int test_mlock_onfault() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("munlock()"); + goto unmap; + } + + ret = unlock_onfault_check(map); +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_lock_onfault_of_present() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + *map = 'a'; + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (!is_vma_lock_on_fault((unsigned long)map) || + !is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA with present pages is not marked lock on fault\n"); + goto unmap; + } + ret = 0; +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_munlockall() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT)) { + perror("mlockall(MCL_CURRENT)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_lock_check(map)) + goto unmap; + + munmap(map, 2 * page_size); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall second mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { + perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_onfault_check(map)) + goto unmap; + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + munlockall(); + return ret; +} + +static int test_vma_management(bool call_mlock) +{ + int ret = 1; + void *map; + unsigned long page_size = getpagesize(); + struct vm_boundaries page1; + struct vm_boundaries page2; + struct vm_boundaries page3; + + map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("mmap()"); + return ret; + } + + if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock(ONFAULT)\n"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* + * Before we unlock a portion, we need to that all three pages are in + * the same VMA. If they are not we abort this test (Note that this is + * not a failure) + */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("VMAs are not merged to start, aborting test\n"); + ret = 0; + goto out; + } + + if (munlock(map + page_size, page_size)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* All three VMAs should be different */ + if (page1.start == page2.start || page2.start == page3.start) { + printf("failed to split VMA for munlock\n"); + goto out; + } + + /* Now unlock the first and third page and check the VMAs again */ + if (munlock(map, page_size * 3)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* Now all three VMAs should be the same */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("failed to merge VMAs after munlock\n"); + goto out; + } + + ret = 0; +out: + munmap(map, 3 * page_size); + return ret; +} + +static int test_mlockall(int (test_function)(bool call_mlock)) +{ + int ret = 1; + + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + ret = test_function(false); + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + ret += test_mlock_lock(); + ret += test_mlock_onfault(); + ret += test_munlockall(); + ret += test_lock_onfault_of_present(); + ret += test_vma_management(true); + ret += test_mlockall(test_vma_management); + return ret; +} diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h new file mode 100644 index 000000000000..2a6e76c226bc --- /dev/null +++ b/tools/testing/selftests/mm/mlock2.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 1 +#endif + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int mlock2_(void *start, size_t len, int flags) +{ +#ifdef __NR_mlock2 + return syscall(__NR_mlock2, start, len, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + +static FILE *seek_to_smaps_entry(unsigned long addr) +{ + FILE *file; + char *line = NULL; + size_t size = 0; + unsigned long start, end; + char perms[5]; + unsigned long offset; + char dev[32]; + unsigned long inode; + char path[BUFSIZ]; + + file = fopen("/proc/self/smaps", "r"); + if (!file) { + perror("fopen smaps"); + _exit(1); + } + + while (getline(&line, &size, file) > 0) { + if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", + &start, &end, perms, &offset, dev, &inode, path) < 6) + goto next; + + if (start <= addr && addr < end) + goto out; + +next: + free(line); + line = NULL; + size = 0; + } + + fclose(file); + file = NULL; + +out: + free(line); + return file; +} diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c new file mode 100644 index 000000000000..6c62966ab5db --- /dev/null +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "util.h" + +#include "../kselftest.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open -1 +#endif + +#ifndef __NR_process_mrelease +#define __NR_process_mrelease -1 +#endif + +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 + +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + + buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) { + perror("mmap failed, halting the test"); + return KSFT_FAIL; + } + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + + /* Signal the parent that the child is ready */ + if (write(pipefd, "", 1) < 0) { + perror("write"); + return KSFT_FAIL; + } + + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + + munmap(buf, nr_pages * PAGE_SIZE); + + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} + +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + int res; + /* Test invalid flags. Expect to fail with EINVAL error code. */ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease with wrong flags"); + exit(res); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease on a live process"); + exit(res); + } +} + +static int child_main(int pipefd[], size_t size) +{ + int res; + + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + close(pipefd[1]); + return res; +} + +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease with wrong pidfd"); + exit(res); + } + + /* Start the test with 1MB child memory allocation */ + size = 1; +retry: + /* + * Pipe for the child to signal when it's done allocating + * memory + */ + if (pipe(pipefd)) { + perror("pipe"); + exit(KSFT_FAIL); + } + pid = fork(); + if (pid < 0) { + perror("fork"); + close(pipefd[0]); + close(pipefd[1]); + exit(KSFT_FAIL); + } + + if (pid == 0) { + /* Child main routine */ + res = child_main(pipefd, size); + exit(res); + } + + /* + * Parent main routine: + * Wait for the child to finish allocations, then kill and reap + */ + close(pipefd[1]); + /* Block until the child is ready */ + res = read(pipefd[0], &byte, 1); + close(pipefd[0]); + if (res < 0) { + perror("read"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + perror("pidfd_open"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + /* Run negative tests which require a live child */ + run_negative_tests(pidfd); + + if (kill(pid, SIGKILL)) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("kill"); + exit(res); + } + + success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); + if (!success) { + /* + * If we failed to reap because the child exited too soon, + * before we could call process_mrelease. Double child's memory + * which causes it to spend more time on cleanup and increases + * our chances of reaping its memory before it exits. + * Retry until we succeed or reach MAX_SIZE_MB. + */ + if (errno == ESRCH) { + retry = (size <= MAX_SIZE_MB); + } else { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease"); + waitpid(pid, NULL, 0); + exit(res); + } + } + + /* Cleanup to prevent zombies */ + if (waitpid(pid, NULL, 0) < 0) { + perror("waitpid"); + exit(KSFT_FAIL); + } + close(pidfd); + + if (!success) { + if (retry) { + size *= 2; + goto retry; + } + printf("All process_mrelease attempts failed!\n"); + exit(KSFT_FAIL); + } + + printf("Success reaping a child with %zuMB of memory allocations\n", + size); + return KSFT_PASS; +} diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c new file mode 100644 index 000000000000..f01dc4a85b0b --- /dev/null +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Tests for mremap w/ MREMAP_DONTUNMAP. + * + * Copyright 2020, Brian Geffon + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#ifndef MREMAP_DONTUNMAP +#define MREMAP_DONTUNMAP 4 +#endif + +unsigned long page_size; +char *page_buffer; + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + dump_maps(); \ + exit(1); \ + } \ + } while (0) + +// Try a simple operation for to "test" for kernel support this prevents +// reporting tests as failed when it's run on an older kernel. +static int kernel_support_for_mremap_dontunmap() +{ + int ret = 0; + unsigned long num_pages = 1; + void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + // This simple remap should only fail if MREMAP_DONTUNMAP isn't + // supported. + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); + if (dest_mapping == MAP_FAILED) { + ret = errno; + } else { + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + } + + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return ret; +} + +// This helper will just validate that an entire mapping contains the expected +// byte. +static int check_region_contains_byte(void *addr, unsigned long size, char byte) +{ + BUG_ON(size & (page_size - 1), + "check_region_contains_byte expects page multiples"); + BUG_ON((unsigned long)addr & (page_size - 1), + "check_region_contains_byte expects page alignment"); + + memset(page_buffer, byte, page_size); + + unsigned long num_pages = size / page_size; + unsigned long i; + + // Compare each page checking that it contains our expected byte. + for (i = 0; i < num_pages; ++i) { + int ret = + memcmp(addr + (i * page_size), page_buffer, page_size); + if (ret) { + return ret; + } + } + + return 0; +} + +// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving +// the source mapping mapped. +static void mremap_dontunmap_simple() +{ + unsigned long num_pages = 5; + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // Because the region is backed by shmem, we will actually see the same + // memory at the source location still. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 'a') != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates MREMAP_DONTUNMAP will move page tables to a specific +// destination using MREMAP_FIXED, also while validating that the source +// remains intact. +static void mremap_dontunmap_simple_fixed() +{ + unsigned long num_pages = 5; + + // Since we want to guarantee that we can remap to a point, we will + // create a mapping up front. + void *dest_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', num_pages * page_size); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + void *remapped_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, + dest_mapping); + BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); + BUG_ON(remapped_mapping != dest_mapping, + "mremap should have placed the remapped mapping at dest_mapping"); + + // The dest mapping will have been unmap by mremap so we expect the Xs + // to be gone and replaced with a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // And the source mapping will have had its ptes dropped. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can MREMAP_DONTUNMAP for a portion of an +// existing mapping. +static void mremap_dontunmap_partial_mapping() +{ + /* + * source mapping: + * -------------- + * | aaaaaaaaaa | + * -------------- + * to become: + * -------------- + * | aaaaa00000 | + * -------------- + * With the destination mapping containing 5 pages of As. + * --------- + * | aaaaa | + * --------- + */ + unsigned long num_pages = 10; + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + // We will grab the last 5 pages of the source and move them. + void *dest_mapping = + mremap(source_mapping + (5 * page_size), 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // We expect the first 5 pages of the source to contain a's and the + // final 5 pages to contain zeros. + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != + 0, "first 5 pages of source should have original pages"); + BUG_ON(check_region_contains_byte + (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, + "final 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != + 0, "dest mapping should contain ptes from the source"); + + BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can remap over only a portion of a mapping. +static void mremap_dontunmap_partial_mapping_overwrite(void) +{ + /* + * source mapping: + * --------- + * |aaaaa| + * --------- + * dest mapping initially: + * ----------- + * |XXXXXXXXXX| + * ------------ + * Source to become: + * --------- + * |00000| + * --------- + * With the destination mapping containing 5 pages of As. + * ------------ + * |aaaaaXXXXX| + * ------------ + */ + void *source_mapping = + mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', 5 * page_size); + + void *dest_mapping = + mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', 10 * page_size); + + // We will grab the last 5 pages of the source and move them. + void *remapped_mapping = + mremap(source_mapping, 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); + + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != + 0, "first 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, + "dest mapping should contain ptes from the source"); + + // Finally the last 5 pages shouldn't have been touched. + BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), + 5 * page_size, 'X') != 0, + "dest mapping should have retained the last 5 pages"); + + BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, 5 * page_size) == -1, + "unable to unmap source mapping"); +} + +int main(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); + + // test for kernel support for MREMAP_DONTUNMAP skipping the test if + // not. + if (kernel_support_for_mremap_dontunmap() != 0) { + printf("No kernel support for MREMAP_DONTUNMAP\n"); + return KSFT_SKIP; + } + + // Keep a page sized buffer around for when we need it. + page_buffer = + mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); + + mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); + mremap_dontunmap_simple_fixed(); + mremap_dontunmap_partial_mapping(); + mremap_dontunmap_partial_mapping_overwrite(); + + BUG_ON(munmap(page_buffer, page_size) == -1, + "unable to unmap page buffer"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c new file mode 100644 index 000000000000..9496346973d4 --- /dev/null +++ b/tools/testing/selftests/mm/mremap_test.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define EXPECT_SUCCESS 0 +#define EXPECT_FAILURE 1 +#define NON_OVERLAPPING 0 +#define OVERLAPPING 1 +#define NS_PER_SEC 1000000000ULL +#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ +#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ + +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) + +struct config { + unsigned long long src_alignment; + unsigned long long dest_alignment; + unsigned long long region_size; + int overlapping; +}; + +struct test { + const char *name; + struct config config; + int expect_failure; +}; + +enum { + _1KB = 1ULL << 10, /* 1KB -> not page aligned */ + _4KB = 4ULL << 10, + _8KB = 8ULL << 10, + _1MB = 1ULL << 20, + _2MB = 2ULL << 20, + _4MB = 4ULL << 20, + _1GB = 1ULL << 30, + _2GB = 2ULL << 30, + PMD = _2MB, + PUD = _1GB, +}; + +#define PTE page_size + +#define MAKE_TEST(source_align, destination_align, size, \ + overlaps, should_fail, test_name) \ +(struct test){ \ + .name = test_name, \ + .config = { \ + .src_alignment = source_align, \ + .dest_alignment = destination_align, \ + .region_size = size, \ + .overlapping = overlaps, \ + }, \ + .expect_failure = should_fail \ +} + +/* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. + */ +static bool is_remap_region_valid(void *addr, unsigned long long size) +{ + void *remap_addr = NULL; + bool ret = true; + + /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ + remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + + if (remap_addr == MAP_FAILED) { + if (errno == EEXIST) + ret = false; + } else { + munmap(remap_addr, size); + } + + return ret; +} + +/* Returns mmap_min_addr sysctl tunable from procfs */ +static unsigned long long get_mmap_min_addr(void) +{ + FILE *fp; + int n_matched; + static unsigned long long addr; + + if (addr) + return addr; + + fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); + if (fp == NULL) { + ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + exit(KSFT_SKIP); + } + + n_matched = fscanf(fp, "%llu", &addr); + if (n_matched != 1) { + ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + fclose(fp); + exit(KSFT_SKIP); + } + + fclose(fp); + return addr; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. + */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + +retry: + addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM || errno == EEXIST) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); + goto retry; + } + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + +/* Returns the time taken for the remap on success else returns -1. */ +static long long remap_region(struct config c, unsigned int threshold_mb, + char pattern_seed) +{ + void *addr, *src_addr, *dest_addr; + unsigned long long i; + struct timespec t_start = {0, 0}, t_end = {0, 0}; + long long start_ns, end_ns, align_mask, ret, offset; + unsigned long long threshold; + + if (threshold_mb == VALIDATION_NO_THRESHOLD) + threshold = c.region_size; + else + threshold = MIN(threshold_mb * _1MB, c.region_size); + + src_addr = get_source_mapping(c); + if (!src_addr) { + ret = -1; + goto out; + } + + /* Set byte pattern */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) + memset((char *) src_addr + i, (char) rand(), 1); + + /* Mask to zero out lower bits of address for alignment */ + align_mask = ~(c.dest_alignment - 1); + /* Offset of destination address from the end of the source region */ + offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment; + addr = (void *) (((unsigned long long) src_addr + c.region_size + + offset) & align_mask); + + /* See comment in get_source_mapping() */ + if (!((unsigned long long) addr & c.dest_alignment)) + addr = (void *) ((unsigned long long) addr | c.dest_alignment); + + /* Don't destroy existing mappings unless expected to overlap */ + while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { + /* Check for unsigned overflow */ + if (addr + c.dest_alignment < addr) { + ksft_print_msg("Couldn't find a valid region to remap to\n"); + ret = -1; + goto out; + } + addr += c.dest_alignment; + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + dest_addr = mremap(src_addr, c.region_size, c.region_size, + MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + if (dest_addr == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + ret = -1; + goto clean_up_src; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) { + char c = (char) rand(); + + if (((char *) dest_addr)[i] != c) { + ksft_print_msg("Data after remap doesn't match at offset %d\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) dest_addr)[i] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + + start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; + end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; + ret = end_ns - start_ns; + +/* + * Since the destination address is specified using MREMAP_FIXED, subsequent + * mremap will unmap any previous mapping at the address range specified by + * dest_addr and region_size. This significantly affects the remap time of + * subsequent tests. So we clean up mappings after each test. + */ +clean_up_dest: + munmap(dest_addr, c.region_size); +clean_up_src: + munmap(src_addr, c.region_size); +out: + return ret; +} + +static void run_mremap_test_case(struct test test_case, int *failures, + unsigned int threshold_mb, + unsigned int pattern_seed) +{ + long long remap_time = remap_region(test_case.config, threshold_mb, + pattern_seed); + + if (remap_time < 0) { + if (test_case.expect_failure) + ksft_test_result_xfail("%s\n\tExpected mremap failure\n", + test_case.name); + else { + ksft_test_result_fail("%s\n", test_case.name); + *failures += 1; + } + } else { + /* + * Comparing mremap time is only applicable if entire region + * was faulted in. + */ + if (threshold_mb == VALIDATION_NO_THRESHOLD || + test_case.config.region_size <= threshold_mb * _1MB) + ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", + test_case.name, remap_time); + else + ksft_test_result_pass("%s\n", test_case.name); + } +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage: %s [[-t ] [-p ]]\n" + "-t\t only validate threshold_mb of the remapped region\n" + " \t if 0 is supplied no threshold is used; all tests\n" + " \t are run and remapped regions validated fully.\n" + " \t The default threshold used is 4MB.\n" + "-p\t provide a seed to generate the random pattern for\n" + " \t validating the remapped region.\n", cmd); +} + +static int parse_args(int argc, char **argv, unsigned int *threshold_mb, + unsigned int *pattern_seed) +{ + const char *optstr = "t:p:"; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 't': + *threshold_mb = atoi(optarg); + break; + case 'p': + *pattern_seed = atoi(optarg); + break; + default: + usage(argv[0]); + return -1; + } + } + + if (optind < argc) { + usage(argv[0]); + return -1; + } + + return 0; +} + +#define MAX_TEST 13 +#define MAX_PERF_TEST 3 +int main(int argc, char **argv) +{ + int failures = 0; + int i, run_perf_tests; + unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + unsigned int pattern_seed; + int num_expand_tests = 1; + struct test test_cases[MAX_TEST]; + struct test perf_test_cases[MAX_PERF_TEST]; + int page_size; + time_t t; + + pattern_seed = (unsigned int) time(&t); + + if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) + exit(EXIT_FAILURE); + + ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", + threshold_mb, pattern_seed); + + page_size = sysconf(_SC_PAGESIZE); + + /* Expected mremap failures */ + test_cases[0] = MAKE_TEST(page_size, page_size, page_size, + OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"); + + test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"); + test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"); + + /* Src addr PTE aligned */ + test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, + NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); + + /* Src addr 1MB aligned */ + test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); + test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src addr PMD aligned */ + test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); + test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); + test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); + + /* Src addr PUD aligned */ + test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); + test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"); + test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); + test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); + perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || + (threshold_mb * _1MB >= _1GB); + + ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_mremap_test_case(test_cases[i], &failures, threshold_mb, + pattern_seed); + + mremap_expand_merge(page_size); + + if (run_perf_tests) { + ksft_print_msg("\n%s\n", + "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); + for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) + run_mremap_test_case(perf_test_cases[i], &failures, + threshold_mb, pattern_seed); + } + + if (failures > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c new file mode 100644 index 000000000000..634d87dfb2a4 --- /dev/null +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int test_limit(void) +{ + int ret = 1; + struct rlimit lims; + void *map; + + if (getrlimit(RLIMIT_MEMLOCK, &lims)) { + perror("getrlimit"); + return ret; + } + + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + if (map != MAP_FAILED) + printf("mmap should have failed, but didn't\n"); + else { + ret = 0; + munmap(map, 2 * lims.rlim_max); + } + + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + ret += test_limit(); + return ret; +} diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h new file mode 100644 index 000000000000..92f3be3dd8e5 --- /dev/null +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h new file mode 100644 index 000000000000..72c14cd3ddc7 --- /dev/null +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline int cpu_max_xsave_size(void) +{ + unsigned long XSTATE_CPUID = 0xd; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); + return ecx; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 +#define XSTATE_BV_OFFSET 512 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c new file mode 100644 index 000000000000..95f403a0c46d --- /dev/null +++ b/tools/testing/selftests/mm/protection_keys.c @@ -0,0 +1,1788 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * There are examples in here of: + * * how to set protection keys on memory + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant + * information from the siginfo + * + * Things to add: + * make sure KSM and KSM COW breaking works + * prefault pages in at malloc, or not + * protect MPX bounds tables with protection keys? + * make sure VMA splitting/merging is working correctly + * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys + * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel + * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks + * + * Compile like this: + * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +int iteration_nr = 1; +int test_nr; + +u64 shadow_pkey_reg; +int dprint_in_signal; +char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +void cat_into_file(char *str, char *file) +{ + int fd = open(file, O_RDWR); + int ret; + + dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); + /* + * these need to be raw because they are called under + * pkey_assert() + */ + if (fd < 0) { + fprintf(stderr, "error opening '%s'\n", str); + perror("error: "); + exit(__LINE__); + } + + ret = write(fd, str, strlen(str)); + if (ret != strlen(str)) { + perror("write to file failed"); + fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); + exit(__LINE__); + } + close(fd); +} + +#if CONTROL_TRACING > 0 +static int warned_tracing; +int tracing_root_ok(void) +{ + if (geteuid() != 0) { + if (!warned_tracing) + fprintf(stderr, "WARNING: not run as root, " + "can not do tracing control\n"); + warned_tracing = 1; + return 0; + } + return 1; +} +#endif + +void tracing_on(void) +{ +#if CONTROL_TRACING > 0 +#define TRACEDIR "/sys/kernel/debug/tracing" + char pidstr[32]; + + if (!tracing_root_ok()) + return; + + sprintf(pidstr, "%d", getpid()); + cat_into_file("0", TRACEDIR "/tracing_on"); + cat_into_file("\n", TRACEDIR "/trace"); + if (1) { + cat_into_file("function_graph", TRACEDIR "/current_tracer"); + cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); + } else { + cat_into_file("nop", TRACEDIR "/current_tracer"); + } + cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); + cat_into_file("1", TRACEDIR "/tracing_on"); + dprintf1("enabled tracing\n"); +#endif +} + +void tracing_off(void) +{ +#if CONTROL_TRACING > 0 + if (!tracing_root_ok()) + return; + cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); +#endif +} + +void abort_hooks(void) +{ + fprintf(stderr, "running %s()...\n", __func__); + tracing_off(); +#ifdef SLEEP_ON_ABORT + sleep(SLEEP_ON_ABORT); +#endif +} + +/* + * This attempts to have roughly a page of instructions followed by a few + * instructions that do a write, and another page of instructions. That + * way, we are pretty sure that the write is in the second page of + * instructions and has at least a page of padding behind it. + * + * *That* lets us be sure to madvise() away the write instruction, which + * will then fault, which makes sure that the fault code handles + * execute-only memory properly. + */ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else +__attribute__((__aligned__(PAGE_SIZE))) +#endif +void lots_o_noops_around_write(int *write_to_me) +{ + dprintf3("running %s()\n", __func__); + __page_o_noops(); + /* Assume this happens in the second page of instructions: */ + *write_to_me = __LINE__; + /* pad out by another page: */ + __page_o_noops(); + dprintf3("%s() done\n", __func__); +} + +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; + + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} + +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); + + return (u32) get_pkey_bits(pkey_reg, pkey); +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); + + __write_pkey_reg(new_pkey_reg); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + +/* Failed address bound checks: */ +#ifndef SEGV_BNDERR +# define SEGV_BNDERR 3 +#endif + +#ifndef SEGV_PKUERR +# define SEGV_PKUERR 4 +#endif + +static char *si_code_str(int si_code) +{ + if (si_code == SEGV_MAPERR) + return "SEGV_MAPERR"; + if (si_code == SEGV_ACCERR) + return "SEGV_ACCERR"; + if (si_code == SEGV_BNDERR) + return "SEGV_BNDERR"; + if (si_code == SEGV_PKUERR) + return "SEGV_PKUERR"; + return "UNKNOWN"; +} + +int pkey_faults; +int last_si_pkey = -1; +void signal_handler(int signum, siginfo_t *si, void *vucontext) +{ + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + char *fpregs; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ + u64 siginfo_pkey; + u32 *si_pkey_ptr; + + dprint_in_signal = 1; + dprintf1(">>>>===============SIGSEGV============================\n"); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + fpregs = (char *) uctxt->uc_mcontext.fpregs; + + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#ifdef __i386__ + /* + * 32-bit has some extra padding so that userspace can tell whether + * the XSTATE header is present in addition to the "legacy" FPU + * state. We just assume that it is here. + */ + fpregs += 0x70; +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); + + /* + * If we got a PKEY fault, we *HAVE* to have at least one bit set in + * here. + */ + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); + if (DEBUG_LEVEL > 4) + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); + + if ((si->si_code == SEGV_MAPERR) || + (si->si_code == SEGV_ACCERR) || + (si->si_code == SEGV_BNDERR)) { + printf("non-PK si_code, exiting...\n"); + exit(4); + } + + si_pkey_ptr = siginfo_get_pkey_ptr(si); + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); + dump_mem((u8 *)si_pkey_ptr - 8, 24); + siginfo_pkey = *si_pkey_ptr; + pkey_assert(siginfo_pkey < NR_PKEYS); + last_si_pkey = siginfo_pkey; + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; + dprintf1("<<<<==================================================\n"); + dprint_in_signal = 0; +} + +int wait_all_children(void) +{ + int status; + return waitpid(-1, &status, 0); +} + +void sig_chld(int x) +{ + dprint_in_signal = 1; + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); + dprint_in_signal = 0; +} + +void setup_sigsegv_handler(void) +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #PF is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; + newact.sa_sigaction = signal_handler; + + /*sigset_t - signals to block while in the handler */ + /* get the old signal mask. */ + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); + pkey_assert(rs == 0); + + /* call sa_sigaction, not sa_handler*/ + newact.sa_flags = SA_SIGINFO; + + newact.sa_restorer = 0; /* void(*)(), obsolete */ + r = sigaction(signum, &newact, &oldact); + r = sigaction(SIGALRM, &newact, &oldact); + pkey_assert(r == 0); +} + +void setup_handlers(void) +{ + signal(SIGCHLD, &sig_chld); + setup_sigsegv_handler(); +} + +pid_t fork_lazy_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + while (1) { + dprintf1("child sleeping...\n"); + sleep(30); + } + } + return forkret; +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int alloc_pkey(void) +{ + int ret; + unsigned long init_val = 0x0; + + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); + ret = sys_pkey_alloc(0, init_val); + /* + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: + */ + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + if (ret > 0) { + /* clear both the bits: */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + /* + * move the new state in from init_val + * (remember, we cheated and init_val == pkey_reg format) + */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); + } + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); + /* for shadow checking: */ + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +/* + * I had a bug where pkey bits could be set by mprotect() but + * not cleared. This ensures we get lots of random bit sets + * and clears on the vma and pte pkey bits. + */ +int alloc_random_pkey(void) +{ + int max_nr_pkey_allocs; + int ret; + int i; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + int random_index; + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + + /* allocate every possible key and make a note of which ones we got */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + + pkey_assert(nr_alloced > 0); + /* select a random one out of the allocated ones */ + random_index = rand() % nr_alloced; + ret = alloced_pkeys[random_index]; + /* now zero it out so we don't free it next */ + alloced_pkeys[random_index] = 0; + + /* go through the allocated ones that we did not want and free them */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int nr_iterations = random() % 100; + int ret; + + while (0) { + int rpkey = alloc_random_pkey(); + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + if (nr_iterations-- < 0) + break; + + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + sys_pkey_free(rpkey); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + } + pkey_assert(pkey < NR_PKEYS); + + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + pkey_assert(!ret); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +struct pkey_malloc_record { + void *ptr; + long size; + int prot; +}; +struct pkey_malloc_record *pkey_malloc_records; +struct pkey_malloc_record *pkey_last_malloc_record; +long nr_pkey_malloc_records; +void record_pkey_malloc(void *ptr, long size, int prot) +{ + long i; + struct pkey_malloc_record *rec = NULL; + + for (i = 0; i < nr_pkey_malloc_records; i++) { + rec = &pkey_malloc_records[i]; + /* find a free record */ + if (rec) + break; + } + if (!rec) { + /* every record is full */ + size_t old_nr_records = nr_pkey_malloc_records; + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); + dprintf2("new_nr_records: %zd\n", new_nr_records); + dprintf2("new_size: %zd\n", new_size); + pkey_malloc_records = realloc(pkey_malloc_records, new_size); + pkey_assert(pkey_malloc_records != NULL); + rec = &pkey_malloc_records[nr_pkey_malloc_records]; + /* + * realloc() does not initialize memory, so zero it from + * the first new record all the way to the end. + */ + for (i = 0; i < new_nr_records - old_nr_records; i++) + memset(rec + i, 0, sizeof(*rec)); + } + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", + (int)(rec - pkey_malloc_records), rec, ptr, size); + rec->ptr = ptr; + rec->size = size; + rec->prot = prot; + pkey_last_malloc_record = rec; + nr_pkey_malloc_records++; +} + +void free_pkey_malloc(void *ptr) +{ + long i; + int ret; + dprintf3("%s(%p)\n", __func__, ptr); + for (i = 0; i < nr_pkey_malloc_records; i++) { + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + if ((ptr < rec->ptr) || + (ptr >= rec->ptr + rec->size)) + continue; + + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + nr_pkey_malloc_records--; + ret = munmap(rec->ptr, rec->size); + dprintf3("munmap ret: %d\n", ret); + pkey_assert(!ret); + dprintf3("clearing rec->ptr, rec: %p\n", rec); + rec->ptr = NULL; + dprintf3("done clearing rec->ptr, rec: %p\n", rec); + return; + } + pkey_assert(false); +} + + +void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + read_pkey_reg(); + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + read_pkey_reg(); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +{ + int ret; + void *ptr; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + /* + * Guarantee we can fit at least one huge page in the resulting + * allocation by allocating space for 2: + */ + size = ALIGN_UP(size, HPAGE_SIZE * 2); + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + record_pkey_malloc(ptr, size, prot); + mprotect_pkey(ptr, size, prot, pkey); + + dprintf1("unaligned ptr: %p\n", ptr); + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); + dprintf1(" aligned ptr: %p\n", ptr); + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); + dprintf1("MADV_WILLNEED ret: %d\n", ret); + memset(ptr, 0, HPAGE_SIZE); + + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" +#define GET_NR_HUGE_PAGES 10 +void setup_hugetlbfs(void) +{ + int err; + int fd; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; + + if (geteuid() != 0) { + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + return; + } + + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); + + /* + * Now go make sure that we got the pages and that they + * are PMD-level pages. Someone might have made PUD-level + * pages the default. + */ + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); + return; + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + close(fd); + if (err <= 0) { + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); + return; + } + + if (atoi(buf) != GET_NR_HUGE_PAGES) { + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); + return; + } + + hugetlb_setup_ok = 1; +} + +void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +{ + void *ptr; + int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; + + if (!hugetlb_setup_ok) + return PTR_ERR_ENOTSUP; + + dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); + size = ALIGN_UP(size, HPAGE_SIZE * 2); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + pkey_assert(ptr != (void *)-1); + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) +{ + void *ptr; + int fd; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + fd = open("/dax/foo", O_RDWR); + pkey_assert(fd >= 0); + + ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); + pkey_assert(ptr != (void *)-1); + + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); + close(fd); + return ptr; +} + +void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { + + malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, + malloc_pkey_anon_huge, + malloc_pkey_hugetlb +/* can not do direct with the pkey_mprotect() API: + malloc_pkey_mmap_direct, + malloc_pkey_mmap_dax, +*/ +}; + +void *malloc_pkey(long size, int prot, u16 pkey) +{ + void *ret; + static int malloc_type; + int nr_malloc_types = ARRAY_SIZE(pkey_malloc); + + pkey_assert(pkey < NR_PKEYS); + + while (1) { + pkey_assert(malloc_type < nr_malloc_types); + + ret = pkey_malloc[malloc_type](size, prot, pkey); + pkey_assert(ret != (void *)-1); + + malloc_type++; + if (malloc_type >= nr_malloc_types) + malloc_type = (random()%nr_malloc_types); + + /* try again if the malloc_type we tried is unsupported */ + if (ret == PTR_ERR_ENOTSUP) + continue; + + break; + } + + dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, + size, prot, pkey, ret); + return ret; +} + +int last_pkey_faults; +#define UNKNOWN_PKEY -2 +void expected_pkey_fault(int pkey) +{ + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); + dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); + pkey_assert(last_pkey_faults + 1 == pkey_faults); + + /* + * For exec-only memory, we do not know the pkey in + * advance, so skip this check. + */ + if (pkey != UNKNOWN_PKEY) + pkey_assert(last_si_pkey == pkey); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ + /* + * The signal handler shold have cleared out PKEY register to let the + * test program continue. We now have to restore it. + */ + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ + pkey_assert(0); + + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; + last_si_pkey = -1; +} + +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ +} while (0) + +int test_fds[10] = { -1 }; +int nr_test_fds; +void __save_test_fd(int fd) +{ + pkey_assert(fd >= 0); + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + test_fds[nr_test_fds] = fd; + nr_test_fds++; +} + +int get_test_read_fd(void) +{ + int test_fd = open("/etc/passwd", O_RDONLY); + __save_test_fd(test_fd); + return test_fd; +} + +void close_test_fds(void) +{ + int i; + + for (i = 0; i < nr_test_fds; i++) { + if (test_fds[i] < 0) + continue; + close(test_fds[i]); + test_fds[i] = -1; + } + nr_test_fds = 0; +} + +#define barrier() __asm__ __volatile__("": : :"memory") +__attribute__((noinline)) int read_ptr(int *ptr) +{ + /* + * Keep GCC from optimizing this away somehow + */ + barrier(); + return *ptr; +} + +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + +void test_read_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling write access to PKEY[1], doing read\n"); + pkey_write_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + dprintf1("\n"); +} +void test_read_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} +void test_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + dprintf1("disabling access to PKEY[%02d], " + "having kernel read() to buffer\n", pkey); + pkey_access_deny(pkey); + ret = read(test_fd, ptr, 1); + dprintf1("read ret: %d\n", ret); + pkey_assert(ret); +} +void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + pkey_write_deny(pkey); + ret = read(test_fd, ptr, 100); + dprintf1("read ret: %d\n", ret); + if (ret < 0 && (DEBUG_LEVEL > 0)) + perror("verbose read result (OK for this to be bad)"); + pkey_assert(ret); +} + +void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +{ + int pipe_ret, vmsplice_ret; + struct iovec iov; + int pipe_fds[2]; + + pipe_ret = pipe(pipe_fds); + + pkey_assert(pipe_ret == 0); + dprintf1("disabling access to PKEY[%02d], " + "having kernel vmsplice from buffer\n", pkey); + pkey_access_deny(pkey); + iov.iov_base = ptr; + iov.iov_len = PAGE_SIZE; + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); + pkey_assert(vmsplice_ret == -1); + + close(pipe_fds[0]); + close(pipe_fds[1]); +} + +void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +{ + int ignored = 0xdada; + int futex_ret; + int some_int = __LINE__; + + dprintf1("disabling write to PKEY[%02d], " + "doing futex gunk in buffer\n", pkey); + *ptr = some_int; + pkey_write_deny(pkey); + futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, + &ignored, ignored); + if (DEBUG_LEVEL > 0) + perror("futex"); + dprintf1("futex() ret: %d\n", futex_ret); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +{ + int err; + int i; + + /* Note: 0 is the default pkey, so don't mess with it */ + for (i = 1; i < NR_PKEYS; i++) { + if (pkey == i) + continue; + + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); + pkey_assert(err); + } +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +{ + int err; + int bad_pkey = NR_PKEYS+99; + + /* pass a known-invalid pkey in: */ + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); + pkey_assert(err); +} + +void become_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + return; + } + exit(0); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS*3; i++) { + int new_pkey; + dprintf1("%s() alloc loop: %d\n", __func__, i); + new_pkey = alloc_pkey(); + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ + dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); + if ((new_pkey == -1) && (errno == ENOSPC)) { + dprintf2("%s() failed to allocate pkey after %d tries\n", + __func__, nr_allocated_pkeys); + } else { + /* + * Ensure the number of successes never + * exceeds the number of keys supported + * in the hardware. + */ + pkey_assert(nr_allocated_pkeys < NR_PKEYS); + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + /* + * Make sure that allocation state is properly + * preserved across fork(). + */ + if (i == NR_PKEYS*2) + become_child(); + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + /* + * On x86: + * There are 16 pkeys supported in hardware. Three are + * allocated by the time we get here: + * 1. The default key (0) + * 2. One possibly consumed by an execute-only mapping. + * 3. One allocated by the test code and passed in via + * 'pkey' to this function. + * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. + */ + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + +void arch_force_pkey_reg_init(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u64 *buf; + + /* + * All keys should be allocated and set to allow reads and + * writes, so the register should be all 0. If not, just + * skip the test. + */ + if (read_pkey_reg()) + return; + + /* + * Just allocate an absurd about of memory rather than + * doing the XSAVE size enumeration dance. + */ + buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + + /* These __builtins require compiling with -mxsave */ + + /* XSAVE to build a valid buffer: */ + __builtin_ia32_xsave(buf, XSTATE_PKEY); + /* Clear XSTATE_BV[PKRU]: */ + buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; + /* XRSTOR will likely get PKRU back to the init state: */ + __builtin_ia32_xrstor(buf, XSTATE_PKEY); + + munmap(buf, 1*MB); +#endif +} + + +/* + * This is mostly useless on ppc for now. But it will not + * hurt anything and should give some better coverage as + * a long-running test that continually checks the pkey + * register. + */ +void test_pkey_init_state(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS; i++) { + int new_pkey = alloc_pkey(); + + if (new_pkey < 0) + continue; + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + arch_force_pkey_reg_init(); + + /* + * Loop for a bit, hoping to get exercise the kernel + * context switch code. + */ + for (i = 0; i < 1000000; i++) + read_pkey_reg(); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + +/* + * pkey 0 is special. It is allocated by default, so you do not + * have to call pkey_alloc() to use it first. Make sure that it + * is usable. + */ +void test_mprotect_with_pkey_0(int *ptr, u16 pkey) +{ + long size; + int prot; + + assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + prot = pkey_last_malloc_record->prot; + + /* Use pkey 0 */ + mprotect_pkey(ptr, size, prot, 0); + + /* Make sure that we can set it back to the original pkey. */ + mprotect_pkey(ptr, size, prot, pkey); +} + +void test_ptrace_of_child(int *ptr, u16 pkey) +{ + __attribute__((__unused__)) int peek_result; + pid_t child_pid; + void *ignored = 0; + long ret; + int status; + /* + * This is the "control" for our little expermient. Make sure + * we can always access it when ptracing. + */ + int *plain_ptr_unaligned = malloc(HPAGE_SIZE); + int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); + + /* + * Fork a child which is an exact copy of this process, of course. + * That means we can do all of our tests via ptrace() and then plain + * memory access and ensure they work differently. + */ + child_pid = fork_lazy_child(); + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); + + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); + if (ret) + perror("attach"); + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); + pkey_assert(ret != -1); + ret = waitpid(child_pid, &status, WUNTRACED); + if ((ret != child_pid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + pkey_assert(0); + } + dprintf2("waitpid ret: %ld\n", ret); + dprintf2("waitpid status: %d\n", status); + + pkey_access_deny(pkey); + pkey_write_deny(pkey); + + /* Write access, untested for now: + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); + pkey_assert(ret != -1); + dprintf1("poke at %p: %ld\n", peek_at, ret); + */ + + /* + * Try to access the pkey-protected "ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect an exception: */ + peek_result = read_ptr(ptr); + expected_pkey_fault(pkey); + + /* + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect NO exception: */ + peek_result = read_ptr(plain_ptr); + do_not_expect_pkey_fault("read plain pointer after ptrace"); + + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); + pkey_assert(ret != -1); + + ret = kill(child_pid, SIGKILL); + pkey_assert(ret != -1); + + wait(&status); + + free(plain_ptr_unaligned); +} + +void *get_pointer_to_instructions(void) +{ + void *p1; + + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); + /* lots_o_noops_around_write should be page-aligned already */ + assert(p1 == &lots_o_noops_around_write); + + /* Point 'p1' at the *second* page of the function: */ + p1 += PAGE_SIZE; + + /* + * Try to ensure we fault this in on next touch to ensure + * we get an instruction fault as opposed to a data one + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + + return p1; +} + +void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); + pkey_assert(!ret); + pkey_access_deny(pkey); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* + * Make sure this is an *instruction* fault + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); +} + +void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + dprintf1("%s() start\n", __func__); + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + /* Use a *normal* mprotect(), not mprotect_pkey(): */ + ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); + pkey_assert(!ret); + + /* + * Reset the shadow, assuming that the above mprotect() + * correctly changed PKRU, but to an unknown value since + * the actual allocated pkey is unknown. + */ + shadow_pkey_reg = __read_pkey_reg(); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* Make sure this is an *instruction* fault */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); + + /* + * Put the memory back to non-PROT_EXEC. Should clear the + * exec-only pkey off the VMA and allow it to be readable + * again. Go to PROT_NONE first to check for a kernel bug + * that did not clear the pkey when doing PROT_NONE. + */ + ret = mprotect(p1, PAGE_SIZE, PROT_NONE); + pkey_assert(!ret); + + ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); + pkey_assert(!ret); + ptr_contents = read_ptr(p1); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); +} + +#if defined(__i386__) || defined(__x86_64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + u32 new_pkru; + pid_t child; + int status, ret; + int pkey_offset = pkey_reg_xstate_offset(); + size_t xsave_size = cpu_max_xsave_size(); + void *xsave; + u32 *pkey_register; + u64 *xstate_bv; + struct iovec iov; + + new_pkru = ~read_pkey_reg(); + /* Don't make PROT_EXEC mappings inaccessible */ + new_pkru &= ~3; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkru) + exit(1); + + /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ + raise(SIGSTOP); + + if (__read_pkey_reg() != 0) + exit(1); + + /* Stop and allow the tracer to examine PKRU */ + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + xsave = (void *)malloc(xsave_size); + pkey_assert(xsave > 0); + + /* Modify the PKRU register directly */ + iov.iov_base = xsave; + iov.iov_len = xsave_size; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + pkey_register = (u32 *)(xsave + pkey_offset); + pkey_assert(*pkey_register == read_pkey_reg()); + + *pkey_register = new_pkru; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Clear the PKRU bit from XSTATE_BV */ + xstate_bv = (u64 *)(xsave + 512); + *xstate_bv &= ~(1 << 9); + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value go to 0 */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); + free(xsave); +} +#endif + +void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +{ + int size = PAGE_SIZE; + int sret; + + if (cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return; + } + + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + pkey_assert(sret < 0); +} + +void (*pkey_tests[])(int *ptr, u16 pkey) = { + test_read_of_write_disabled_region, + test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, + test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, + test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, + test_kernel_write_of_access_disabled_region, + test_kernel_write_of_write_disabled_region, + test_kernel_gup_of_access_disabled_region, + test_kernel_gup_write_to_write_disabled_region, + test_executing_on_unreadable_memory, + test_implicit_mprotect_exec_only_memory, + test_mprotect_with_pkey_0, + test_ptrace_of_child, + test_pkey_init_state, + test_pkey_syscalls_on_non_allocated_pkey, + test_pkey_syscalls_bad_args, + test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, +#if defined(__i386__) || defined(__x86_64__) + test_ptrace_modifies_pkru, +#endif +}; + +void run_tests_once(void) +{ + int *ptr; + int prot = PROT_READ|PROT_WRITE; + + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + int pkey; + int orig_pkey_faults = pkey_faults; + + dprintf1("======================\n"); + dprintf1("test %d preparing...\n", test_nr); + + tracing_on(); + pkey = alloc_random_pkey(); + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); + dprintf1("test %d starting...\n", test_nr); + pkey_tests[test_nr](ptr, pkey); + dprintf1("freeing test memory: %p\n", ptr); + free_pkey_malloc(ptr); + sys_pkey_free(pkey); + + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); + + tracing_off(); + close_test_fds(); + + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); + dprintf1("======================\n\n"); + } + iteration_nr++; +} + +void pkey_setup_shadow(void) +{ + shadow_pkey_reg = __read_pkey_reg(); +} + +int main(void) +{ + int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); + + srand((unsigned int)time(NULL)); + + setup_handlers(); + + printf("has pkeys: %d\n", pkeys_supported); + + if (!pkeys_supported) { + int size = PAGE_SIZE; + int *ptr; + + printf("running PKEY tests for unsupported CPU/OS\n"); + + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + assert(ptr != (void *)-1); + test_mprotect_pkey_on_unsupported_cpu(ptr, 1); + exit(0); + } + + pkey_setup_shadow(); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); + setup_hugetlbfs(); + + while (nr_iterations-- > 0) + run_tests_once(); + + printf("done (all tests OK)\n"); + return 0; +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh new file mode 100644 index 000000000000..8984e0bb58c7 --- /dev/null +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -0,0 +1,274 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Please run as root + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +exitcode=0 + +usage() { + cat <"] + -t: specify specific categories to tests to run + -h: display this message + +The default behavior is to run all tests. + +Alternatively, specific groups tests can be run by passing a string +to the -t argument containing one or more of the following categories +separated by spaces: +- mmap + tests for mmap(2) +- gup_test + tests for gup using gup_test interface +- userfaultfd + tests for userfaultfd(2) +- compaction + a test for the patch "Allow compaction of unevictable pages" +- mlock + tests for mlock(2) +- mremap + tests for mremap(2) +- hugevm + tests for very large virtual address space +- vmalloc + vmalloc smoke tests +- hmm + hmm smoke tests +- madv_populate + test memadvise(2) MADV_POPULATE_{READ,WRITE} options +- memfd_secret + test memfd_secret(2) +- process_mrelease + test process_mrelease(2) +- ksm + ksm tests that do not require >=2 NUMA nodes +- ksm_numa + ksm tests that require >=2 NUMA nodes +- pkey + memory protection key tests +- soft_dirty + test soft dirty page bit semantics +- cow + test copy-on-write semantics +example: ./run_vmtests.sh -t "hmm mmap ksm" +EOF + exit 0 +} + + +while getopts "ht:" OPT; do + case ${OPT} in + "h") usage ;; + "t") VM_SELFTEST_ITEMS=${OPTARG} ;; + esac +done +shift $((OPTIND -1)) + +# default behavior: run all tests +VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} + +test_selected() { + if [ "$VM_SELFTEST_ITEMS" == "default" ]; then + # If no VM_SELFTEST_ITEMS are specified, run all tests + return 0 + fi + # If test selected argument is one of the test items + if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then + return 0 + else + return 1 + fi +} + +# get huge pagesize and freepages from /proc/meminfo +while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs="$size" + fi + if [ "$name" = "Hugepagesize:" ]; then + hpgsize_KB="$size" + fi +done < /proc/meminfo + +# Simple hugetlbfs tests have a hardcoded minimum requirement of +# huge pages totaling 256MB (262144KB) in size. The userfaultfd +# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take +# both of these requirements into account and attempt to increase +# number of huge pages available. +nr_cpus=$(nproc) +hpgsize_MB=$((hpgsize_KB / 1024)) +half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) +needmem_KB=$((half_ufd_size_MB * 2 * 1024)) + +# set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + needpgs=$((needmem_KB / hpgsize_KB)) + tries=2 + while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do + lackpgs=$((needpgs - freepgs)) + echo 3 > /proc/sys/vm/drop_caches + if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then + echo "Please run this test as root" + exit $ksft_skip + fi + while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + done < /proc/meminfo + tries=$((tries - 1)) + done + if [ "$freepgs" -lt "$needpgs" ]; then + printf "Not enough huge pages available (%d < %d)\n" \ + "$freepgs" "$needpgs" + exit 1 + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +# filter 64bit architectures +ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" +if [ -z "$ARCH" ]; then + ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') +fi +VADDR64=0 +echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 + +# Usage: run_test [test binary] [arbitrary test arguments...] +run_test() { + if test_selected ${CATEGORY}; then + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" + + "$@" + local ret=$? + if [ $ret -eq 0 ]; then + echo "[PASS]" + elif [ $ret -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip + else + echo "[FAIL]" + exitcode=1 + fi + fi # test_selected +} + +CATEGORY="hugetlb" run_test ./hugepage-mmap + +shmmax=$(cat /proc/sys/kernel/shmmax) +shmall=$(cat /proc/sys/kernel/shmall) +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +CATEGORY="hugetlb" run_test ./hugepage-shm +echo "$shmmax" > /proc/sys/kernel/shmmax +echo "$shmall" > /proc/sys/kernel/shmall + +CATEGORY="hugetlb" run_test ./map_hugetlb +CATEGORY="hugetlb" run_test ./hugepage-mremap +CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-madvise + +if test_selected "hugetlb"; then + echo "NOTE: These hugetlb tests provide minimal coverage. Use" + echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" + echo " hugetlb regression testing." +fi + +CATEGORY="mmap" run_test ./map_fixed_noreplace + +# get_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -u +# pin_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -a +# Dump pages 0, 19, and 4096, using pin_user_pages: +CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 + +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. + CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 +done + +#cleanup +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages + +CATEGORY="compaction" run_test ./compaction_test + +CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit + +CATEGORY="mmap" run_test ./map_populate + +CATEGORY="mlock" run_test ./mlock-random-test + +CATEGORY="mlock" run_test ./mlock2-tests + +CATEGORY="process_mrelease" run_test ./mrelease_test + +CATEGORY="mremap" run_test ./mremap_test + +CATEGORY="hugetlb" run_test ./thuge-gen + +if [ $VADDR64 -ne 0 ]; then + CATEGORY="hugevm" run_test ./virtual_address_range + + # virtual address 128TB switch test + CATEGORY="hugevm" run_test ./va_128TBswitch.sh +fi # VADDR64 + +# vmalloc stability smoke test +CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke + +CATEGORY="mremap" run_test ./mremap_dontunmap + +CATEGORY="hmm" run_test ./test_hmm.sh smoke + +# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests +CATEGORY="madv_populate" run_test ./madv_populate + +CATEGORY="memfd_secret" run_test ./memfd_secret + +# KSM MADV_MERGEABLE test with 10 identical pages +CATEGORY="ksm" run_test ./ksm_tests -M -p 10 +# KSM unmerge test +CATEGORY="ksm" run_test ./ksm_tests -U +# KSM test with 10 zero pages and use_zero_pages = 0 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 +# KSM test with 10 zero pages and use_zero_pages = 1 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 1 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 0 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 + +CATEGORY="ksm" run_test ./ksm_functional_tests + +run_test ./ksm_functional_tests + +# protection_keys tests +if [ -x ./protection_keys_32 ] +then + CATEGORY="pkey" run_test ./protection_keys_32 +fi + +if [ -x ./protection_keys_64 ] +then + CATEGORY="pkey" run_test ./protection_keys_64 +fi + +CATEGORY="soft_dirty" run_test ./soft-dirty + +# COW tests +CATEGORY="cow" run_test ./cow + +exit $exitcode diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings new file mode 100644 index 000000000000..9abfc60e9e6f --- /dev/null +++ b/tools/testing/selftests/mm/settings @@ -0,0 +1 @@ +timeout=45 diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c new file mode 100644 index 000000000000..21d8830c5f24 --- /dev/null +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include "vm_util.h" + +#define PAGEMAP_FILE_PATH "/proc/self/pagemap" +#define TEST_ITERATIONS 10000 + +static void test_simple(int pagemap_fd, int pagesize) +{ + int i; + char *map; + + map = aligned_alloc(pagesize, pagesize); + if (!map) + ksft_exit_fail_msg("mmap failed\n"); + + clear_softdirty(); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + clear_softdirty(); + } + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +static void test_vma_reuse(int pagemap_fd, int pagesize) +{ + char *map, *map2; + + map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // The kernel always marks new regions as soft dirty + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s dirty bit of allocated page\n", __func__); + + clear_softdirty(); + munmap(map, pagesize); + + map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // Dirty bit is set for new regions even if they are reused + if (map == map2) + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s dirty bit of reused address page\n", __func__); + else + ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); + + munmap(map2, pagesize); +} + +static void test_hugepage(int pagemap_fd, int pagesize) +{ + char *map; + int i, ret; + size_t hpage_len = read_pmd_pagesize(); + + map = memalign(hpage_len, hpage_len); + if (!map) + ksft_exit_fail_msg("memalign failed\n"); + + ret = madvise(map, hpage_len, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d\n", ret); + + for (i = 0; i < hpage_len; i++) + map[i] = (char)i; + + if (check_huge_anon(map, 1, hpage_len)) { + ksft_test_result_pass("Test %s huge page allocation\n", __func__); + + clear_softdirty(); + for (i = 0 ; i < TEST_ITERATIONS ; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + clear_softdirty(); + } + + ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); + } else { + // hugepage allocation failed. skip these tests + ksft_test_result_skip("Test %s huge page allocation\n", __func__); + ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); + } + free(map); +} + +static void test_mprotect(int pagemap_fd, int pagesize, bool anon) +{ + const char *type[] = {"file", "anon"}; + const char *fname = "./soft-dirty-test-file"; + int test_fd; + char *map; + + if (anon) { + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (!map) + ksft_exit_fail_msg("anon mmap failed\n"); + } else { + test_fd = open(fname, O_RDWR | O_CREAT); + if (test_fd < 0) { + ksft_test_result_skip("Test %s open() file failed\n", __func__); + return; + } + unlink(fname); + ftruncate(test_fd, pagesize); + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_SHARED, test_fd, 0); + if (!map) + ksft_exit_fail_msg("file mmap failed\n"); + } + + *map = 1; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s dirty bit of new written page\n", + __func__, type[anon]); + clear_softdirty(); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after clear_refs\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RO\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ|PROT_WRITE); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RW\n", + __func__, type[anon]); + *map = 2; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s soft-dirty after rewritten\n", + __func__, type[anon]); + + munmap(map, pagesize); + + if (!anon) + close(test_fd); +} + +static void test_mprotect_anon(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, true); +} + +static void test_mprotect_file(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, false); +} + +int main(int argc, char **argv) +{ + int pagemap_fd; + int pagesize; + + ksft_print_header(); + ksft_set_plan(15); + + pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); + + pagesize = getpagesize(); + + test_simple(pagemap_fd, pagesize); + test_vma_reuse(pagemap_fd, pagesize); + test_hugepage(pagemap_fd, pagesize); + test_mprotect_anon(pagemap_fd, pagesize); + test_mprotect_file(pagemap_fd, pagesize); + + close(pagemap_fd); + + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c new file mode 100644 index 000000000000..76e1c36dd9e5 --- /dev/null +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via /split_huge_pages interface. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define INPUT_MAX 80 + +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(const char *fmt, ...) +{ + char input[INPUT_MAX]; + int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); + + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + split_file_backed_thp(); + + return 0; +} diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh new file mode 100644 index 000000000000..46e19b5d648d --- /dev/null +++ b/tools/testing/selftests/mm/test_hmm.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="test_hmm" +DRIVER="test_hmm" + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_HMM=m" + exit $ksft_skip + fi +} + +load_driver() +{ + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi + fi +} + +unload_driver() +{ + modprobe -r $DRIVER > /dev/null 2>&1 +} + +run_smoke() +{ + echo "Running smoke test. Note, this test provides basic coverage." + + load_driver $1 $2 + $(dirname "${BASH_SOURCE[0]}")/hmm-tests + unload_driver +} + +usage() +{ + echo -n "Usage: $0" + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${TEST_NAME}.sh" + echo + echo "# Smoke testing" + echo "./${TEST_NAME}.sh smoke" + echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke " + echo + exit 0 +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [ "$1" = "smoke" ]; then + run_smoke $2 $3 + else + usage + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh new file mode 100644 index 000000000000..d73b846736f1 --- /dev/null +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="vmalloc" +DRIVER="test_${TEST_NAME}" +NUM_CPUS=`grep -c ^processor /proc/cpuinfo` + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# +# Static templates for performance, stressing and smoke tests. +# Also it is possible to pass any supported parameters manualy. +# +PERF_PARAM="sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_VMALLOC=m" + exit $ksft_skip + fi +} + +run_perfformance_check() +{ + echo "Run performance tests to evaluate how fast vmalloc allocation is." + echo "It runs all test cases on one single CPU with sequential order." + + modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + echo "Done." + echo "Ccheck the kernel message buffer to see the summary." +} + +run_stability_check() +{ + echo "Run stability tests. In order to stress vmalloc subsystem all" + echo "available test cases are run by NUM_CPUS workers simultaneously." + echo "It will take time, so be patient." + + modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +run_smoke_check() +{ + echo "Run smoke test. Note, this test provides basic coverage." + echo "Please check $0 output how it can be used" + echo "for deep performance analysis as well as stress testing." + + modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +usage() +{ + echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " + echo "manual parameters" + echo + echo "Valid tests and parameters:" + echo + modinfo $DRIVER + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${DRIVER}.sh" + echo + echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" + echo + echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " + echo "sequential order" + echo -n "./${DRIVER}.sh sequential_test_order=1 " + echo "run_test_mask=23" + echo + echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " + echo "20 times" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" + echo + echo "# Performance analysis" + echo "./${DRIVER}.sh performance" + echo + echo "# Stress testing" + echo "./${DRIVER}.sh stress" + echo + exit 0 +} + +function validate_passed_args() +{ + VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` + + # + # Something has been passed, check it. + # + for passed_arg in $@; do + key=${passed_arg//=*/} + val="${passed_arg:$((${#key}+1))}" + valid=0 + + for valid_arg in $VALID_ARGS; do + if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then + valid=1 + break + fi + done + + if [[ $valid -ne 1 ]]; then + echo "Error: key or value is not correct: ${key} $val" + exit $exitcode + fi + done +} + +function run_manual_check() +{ + # + # Validate passed parameters. If there is wrong one, + # the script exists and does not execute further. + # + validate_passed_args $@ + + echo "Run the test with following parameters: $@" + modprobe $DRIVER $@ > /dev/null 2>&1 + echo "Done." + echo "Check the kernel ring buffer to see the summary." +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [[ "$1" = "performance" ]]; then + run_perfformance_check + elif [[ "$1" = "stress" ]]; then + run_stability_check + elif [[ "$1" = "smoke" ]]; then + run_smoke_check + else + run_manual_check $@ + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c new file mode 100644 index 000000000000..361ef7192cc6 --- /dev/null +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test selecting other page sizes for mmap/shmget. + + Before running this huge pages for each huge page size must have been + reserved. + For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. + Also shmmax must be increased. + And you need to run as root to work around some weird permissions in shm. + And nothing using huge pages should run in parallel. + When the program aborts you may need to clean up the shm segments with + ipcrm -m by hand, like this + sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m + (warning this will remove all if someone else uses them) */ + +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define err(x) perror(x), exit(1) + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#define MAP_HUGE_SHIFT 26 +#define MAP_HUGE_MASK 0x3f +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif + +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_HUGE_SHIFT 26 +#define SHM_HUGE_MASK 0x3f +#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) +#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) + +#define NUM_PAGESIZES 5 + +#define NUM_PAGES 4 + +#define Dprintf(fmt...) // printf(fmt) + +unsigned long page_sizes[NUM_PAGESIZES]; +int num_page_sizes; + +int ilog2(unsigned long v) +{ + int l = 0; + while ((1UL << l) < v) + l++; + return l; +} + +void find_pagesizes(void) +{ + glob_t g; + int i; + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; i < g.gl_pathc; i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[i]); + page_sizes[i] <<= 10; + printf("Found %luMB\n", page_sizes[i] >> 20); + } + num_page_sizes = g.gl_pathc; + globfree(&g); +} + +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + free(line); + return hps; +} + +void show(unsigned long ps) +{ + char buf[100]; + if (ps == getpagesize()) + return; + printf("%luMB: ", ps >> 20); + fflush(stdout); + snprintf(buf, sizeof buf, + "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + system(buf); +} + +unsigned long read_sysfs(int warn, char *fmt, ...) +{ + char *line = NULL; + size_t linelen = 0; + char buf[100]; + FILE *f; + va_list ap; + unsigned long val = 0; + + va_start(ap, fmt); + vsnprintf(buf, sizeof buf, fmt, ap); + va_end(ap); + + f = fopen(buf, "r"); + if (!f) { + if (warn) + printf("missing %s\n", buf); + return 0; + } + if (getline(&line, &linelen, f) > 0) { + sscanf(line, "%lu", &val); + } + fclose(f); + free(line); + return val; +} + +unsigned long read_free(unsigned long ps) +{ + return read_sysfs(ps != getpagesize(), + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); +} + +void test_mmap(unsigned long size, unsigned flags) +{ + char *map; + unsigned long before, after; + int err; + + before = read_free(size); + map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); + + if (map == (char *)-1) err("mmap"); + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + Dprintf("before %lu after %lu diff %ld size %lu\n", + before, after, before - after, size); + assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); + err = munmap(map, size); + assert(!err); +} + +void test_shmget(unsigned long size, unsigned flags) +{ + int id; + unsigned long before, after; + int err; + + before = read_free(size); + id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); + if (id < 0) err("shmget"); + + struct shm_info i; + if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); + Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); + + + Dprintf("id %d\n", id); + char *map = shmat(id, NULL, 0600); + if (map == (char*)-1) err("shmat"); + + shmctl(id, IPC_RMID, NULL); + + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + + Dprintf("before %lu after %lu diff %ld size %lu\n", + before, after, before - after, size); + assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); + err = shmdt(map); + assert(!err); +} + +void sanity_checks(void) +{ + int i; + unsigned long largest = getpagesize(); + + for (i = 0; i < num_page_sizes; i++) { + if (page_sizes[i] > largest) + largest = page_sizes[i]; + + if (read_free(page_sizes[i]) < NUM_PAGES) { + printf("Not enough huge pages for page size %lu MB, need %u\n", + page_sizes[i] >> 20, + NUM_PAGES); + exit(0); + } + } + + if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { + printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); + exit(0); + } + +#if defined(__x86_64__) + if (largest != 1U<<30) { + printf("No GB pages available on x86-64\n" + "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); + exit(0); + } +#endif +} + +int main(void) +{ + int i; + unsigned default_hps = default_huge_page_size(); + + find_pagesizes(); + + sanity_checks(); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << MAP_HUGE_SHIFT; + printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); + test_mmap(ps, MAP_HUGETLB | arg); + } + printf("Testing default huge mmap\n"); + test_mmap(default_hps, SHM_HUGETLB); + + puts("Testing non-huge shmget"); + test_shmget(getpagesize(), 0); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << SHM_HUGE_SHIFT; + printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); + test_shmget(ps, SHM_HUGETLB | arg); + } + puts("default huge shmget"); + test_shmget(default_hps, SHM_HUGETLB); + + return 0; +} diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c new file mode 100644 index 000000000000..e3f00adb1b82 --- /dev/null +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -0,0 +1,122 @@ +/* + * Stress test for transparent huge pages, memory compaction and migration. + * + * Authors: Konstantin Khlebnikov + * + * This is free and unencumbered software released into the public domain. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +int backing_fd = -1; +int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; +#define PROT_RW (PROT_READ | PROT_WRITE) + +int main(int argc, char **argv) +{ + size_t ram, len; + void *ptr, *p; + struct timespec a, b; + int i = 0; + char *name = NULL; + double s; + uint8_t *map; + size_t map_len; + int pagemap_fd; + + ram = sysconf(_SC_PHYS_PAGES); + if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) + ram = SIZE_MAX / 4; + else + ram *= sysconf(_SC_PAGESIZE); + len = ram; + + while (++i < argc) { + if (!strcmp(argv[i], "-h")) + errx(1, "usage: %s [size in MiB]", argv[0]); + else if (!strcmp(argv[i], "-f")) + name = argv[++i]; + else + len = atoll(argv[i]) << 20; + } + + if (name) { + backing_fd = open(name, O_RDWR); + if (backing_fd == -1) + errx(2, "open %s", name); + mmap_flags = MAP_SHARED; + } + + warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" + " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, + ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + len -= len % HPAGE_SIZE; + ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); + if (ptr == MAP_FAILED) + err(2, "initial mmap"); + ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; + + if (madvise(ptr, len, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + map_len = ram >> (HPAGE_SHIFT - 1); + map = malloc(map_len); + if (!map) + errx(2, "map malloc"); + + while (1) { + int nr_succeed = 0, nr_failed = 0, nr_pages = 0; + + memset(map, 0, map_len); + + clock_gettime(CLOCK_MONOTONIC, &a); + for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { + int64_t pfn; + + pfn = allocate_transhuge(p, pagemap_fd); + + if (pfn < 0) { + nr_failed++; + } else { + size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); + + nr_succeed++; + if (idx >= map_len) { + map = realloc(map, idx + 1); + if (!map) + errx(2, "map realloc"); + memset(map + map_len, 0, idx + 1 - map_len); + map_len = idx + 1; + } + if (!map[idx]) + nr_pages++; + map[idx] = 1; + } + + /* split transhuge page, keep last page */ + if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) + err(2, "MADV_DONTNEED"); + } + clock_gettime(CLOCK_MONOTONIC, &b); + s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; + + warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); + } +} diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c new file mode 100644 index 000000000000..7f22844ed704 --- /dev/null +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -0,0 +1,1858 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. + * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#ifdef __NR_userfaultfd + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + +/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ +#define ALARM_INTERVAL_SECS 10 +static volatile bool test_uffdio_copy_eexist = true; +static volatile bool test_uffdio_zeropage_eexist = true; +/* Whether to test uffd write-protection */ +static bool test_uffdio_wp = true; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; +static bool map_shared; +static int mem_fd; +static unsigned long long *count_verify; +static int uffd = -1; +static int uffd_flags, finished, *pipefd; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +static char *zeropage; +pthread_attr_t attr; +static bool test_collapse; + +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; +}; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. + */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" + "./userfaultfd hugetlb_shared 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported : anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " + "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, "%s", examples); + exit(1); +} + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, line=%d)\n", \ + ret, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) + +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; + } +} + +static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static void hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static void shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops { + void (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; + +static struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops *uffd_test_ops; + +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + +static uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + +static void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +static void uffd_test_ctx_init(uint64_t features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); + + userfaultfd_open(&features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. + */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + unsigned long long count; + + if (!(bounces & BOUNCE_RANDOM)) { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); + } else + page_nr += 1; + page_nr %= nr_pages; + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +static int __copy_page(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +static int copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true); +} + +static int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} + +static int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +static void uffd_handle_page_fault(struct uffd_msg *msg, + struct uffd_stats *stats) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset)) + stats->missing_faults++; + } +} + +static void *uffd_poll_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + uffd_handle_page_fault(&msg, stats); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + struct uffd_msg msg; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + if (uffd_read_msg(uffd, &msg)) + continue; + uffd_handle_page_fault(&msg, stats); + } + + return NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; + + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + return NULL; +} + +static int stress(struct uffd_stats *uffd_stats) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + (void *)&uffd_stats[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). + */ + uffd_test_ops->release_pages(area_src); + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + return 0; +} + +sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. + * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test) +{ + unsigned long nr; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + err("nr %lu is not zero", nr); + + return 0; +} + +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + offset); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); + __s64 res; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu", offset); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) { + err("UFFDIO_ZEROPAGE unexpected size"); + } else { + if (test_uffdio_zeropage_eexist && retry) { + test_uffdio_zeropage_eexist = false; + retry_uffdio_zeropage(ufd, &uffdio_zeropage, + offset); + } + return 1; + } + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return 0; +} + +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + return __uffdio_zeropage(ufd, offset, false); +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + uffd_test_ctx_init(0); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (uffdio_zeropage(uffd, 0)) + if (my_bcmp(area_dst, zeropage, page_size)) + err("zeropage is not zero"); + + printf("done.\n"); + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing events (fork, remap, remove): "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_REMOVE; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; +} + +static int userfaultfd_sig_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing signal delivery: "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (faulting_process(1)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + printf("done.\n"); + if (userfaults) + err("Signal test failed, userfaults: %ld", userfaults); + + return userfaults != 0; +} + +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + +static int userfaultfd_minor_test(void) +{ + unsigned long p; + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + char c; + struct uffd_stats stats = { 0 }; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + uffd_test_ctx_init(uffd_minor_feature()); + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +static uint64_t pagemap_read_vaddr(int fd, void *vaddr) +{ + uint64_t value; + int ret; + + ret = pread(fd, &value, sizeof(uint64_t), + ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); + if (ret != sizeof(uint64_t)) + err("pread() on pagemap failed"); + + return value; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_read_vaddr(fd, area_dst); + /* + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK + */ + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + return result; +} + +static void userfaultfd_pagemap_test(unsigned int test_pgsize) +{ + struct uffdio_register uffdio_register; + int pagemap_fd; + uint64_t value; + + /* Pagemap tests uffd-wp only */ + if (!test_uffdio_wp) + return; + + /* Not enough memory to test this page size */ + if (test_pgsize > nr_pages * page_size) + return; + + printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + + uffd_test_ctx_init(0); + + if (test_pgsize > page_size) { + /* This is a thp test */ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failed"); + } else if (test_pgsize == page_size) { + /* This is normal page test; force no thp */ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failed"); + } + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(true)) + err("Detected stall uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even swapped out */ + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(false)) + err("Detected stall uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + printf("done\n"); +} + +static int userfaultfd_stress(void) +{ + void *area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + + uffd_test_ctx_init(0); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + else + printf(" read"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) + area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure alias"); + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment this out madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. + */ + uffd_test_ops->release_pages(area_dst); + + uffd_stats_reset(uffd_stats, nr_cpus); + + /* bounce pass */ + if (stress(uffd_stats)) + return 1; + + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + err("unregister failure"); + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) area_dst; + if (ioctl(uffd, UFFDIO_UNREGISTER, + &uffdio_register.range)) + err("unregister failure alias"); + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); + + /* prepare next bounce */ + swap(area_src, area_dst); + + swap(area_src_alias, area_dst_alias); + + uffd_stats_report(uffd_stats, nr_cpus); + } + + if (test_type == TEST_ANON) { + /* + * shmem/hugetlb won't be able to run since they have different + * behavior on fork() (file-backed memory normally drops ptes + * directly when fork), meanwhile the pagemap test will verify + * pgtable entry of fork()ed child. + */ + userfaultfd_pagemap_test(page_size); + /* + * Hard-code for x86_64 for now for 2M THP, as x86_64 is + * currently the only one that supports uffd-wp + */ + userfaultfd_pagemap_test(page_size * 512); + } + + return userfaultfd_zeropage_test() || userfaultfd_sig_test() + || userfaultfd_events_test() || userfaultfd_minor_test(); +} + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static void set_test_type(const char *type) +{ + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "hugetlb_shared")) { + map_shared = true; + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; + } else if (!strcmp(type, "shmem")) { + map_shared = true; + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; + } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + + if (test_type == TEST_HUGETLB) + page_size = hpage_size; + else + page_size = sysconf(_SC_PAGE_SIZE); + + if (!page_size) + err("Unable to determine page size"); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; +} + +static void sigalrm(int sig) +{ + if (sig != SIGALRM) + abort(); + test_uffdio_copy_eexist = true; + test_uffdio_zeropage_eexist = true; + alarm(ALARM_INTERVAL_SECS); +} + +int main(int argc, char **argv) +{ + size_t bytes; + + if (argc < 4) + usage(); + + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); + alarm(ALARM_INTERVAL_SECS); + + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * paze_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P)) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to chose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so have to search for it. + * Instead, just use the highest power of 2 dividing both + * nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; + if (!nr_pages_per_cpu) { + _err("invalid MiB"); + usage(); + } + + bounces = atoi(argv[3]); + if (bounces <= 0) { + _err("invalid bounces"); + usage(); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + + if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { + unsigned int memfd_flags = 0; + + if (test_type == TEST_HUGETLB) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create(argv[0], memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, nr_pages * page_size * 2)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) + err("fallocate"); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h new file mode 100644 index 000000000000..b27d26199334 --- /dev/null +++ b/tools/testing/selftests/mm/util.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KSELFTEST_VM_UTIL_H +#define __KSELFTEST_VM_UTIL_H + +#include +#include +#include +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned int __page_size; +static unsigned int __page_shift; + +static inline unsigned int page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SHIFT (page_shift()) +#define PAGE_SIZE (page_size()) +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + + +static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +#endif diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c new file mode 100644 index 000000000000..1d2068989883 --- /dev/null +++ b/tools/testing/selftests/mm/va_128TBswitch.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Authors: Kirill A. Shutemov + * Authors: Aneesh Kumar K.V + */ + +#include +#include +#include + +#include "../kselftest.h" + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. + */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = KSFT_PASS; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = KSFT_FAIL; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = KSFT_FAIL; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return KSFT_SKIP; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh new file mode 100644 index 000000000000..41580751dc51 --- /dev/null +++ b/tools/testing/selftests/mm/va_128TBswitch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_128TBswitch diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c new file mode 100644 index 000000000000..c0592646ed93 --- /dev/null +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2017, Anshuman Khandual, IBM Corp. + * + * Works on architectures which support 128TB virtual + * address range and beyond. + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * Maximum address range mapped with a single mmap() + * call is little bit more than 16GB. Hence 16GB is + * chosen as the single chunk size for address space + * mapping. + */ +#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ + +/* + * Address space till 128TB is mapped without any hint + * and is enabled by default. Address space beyond 128TB + * till 512TB is obtained by passing hint address as the + * first argument into mmap() system call. + * + * The process heap address space is divided into two + * different areas one below 128TB and one above 128TB + * till it reaches 512TB. One with size 128TB and the + * other being 384TB. + * + * On Arm64 the address space is 256TB and no high mappings + * are supported so far. + */ + +#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ +#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) +#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) + +#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ +#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ + +#ifdef __aarch64__ +#define HIGH_ADDR_MARK ADDR_MARK_256TB +#define HIGH_ADDR_SHIFT 49 +#define NR_CHUNKS_LOW NR_CHUNKS_256TB +#define NR_CHUNKS_HIGH 0 +#else +#define HIGH_ADDR_MARK ADDR_MARK_128TB +#define HIGH_ADDR_SHIFT 48 +#define NR_CHUNKS_LOW NR_CHUNKS_128TB +#define NR_CHUNKS_HIGH NR_CHUNKS_384TB +#endif + +static char *hind_addr(void) +{ + int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); + + return (char *) (1UL << bits); +} + +static int validate_addr(char *ptr, int high_addr) +{ + unsigned long addr = (unsigned long) ptr; + + if (high_addr) { + if (addr < HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; + } + + if (addr > HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; +} + +static int validate_lower_address_hint(void) +{ + char *ptr; + + ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr == MAP_FAILED) + return 0; + + return 1; +} + +int main(int argc, char *argv[]) +{ + char *ptr[NR_CHUNKS_LOW]; + char *hptr[NR_CHUNKS_HIGH]; + char *hint; + unsigned long i, lchunks, hchunks; + + for (i = 0; i < NR_CHUNKS_LOW; i++) { + ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr[i] == MAP_FAILED) { + if (validate_lower_address_hint()) + return 1; + break; + } + + if (validate_addr(ptr[i], 0)) + return 1; + } + lchunks = i; + + for (i = 0; i < NR_CHUNKS_HIGH; i++) { + hint = hind_addr(); + hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (hptr[i] == MAP_FAILED) + break; + + if (validate_addr(hptr[i], 1)) + return 1; + } + hchunks = i; + + for (i = 0; i < lchunks; i++) + munmap(ptr[i], MAP_CHUNK_SIZE); + + for (i = 0; i < hchunks; i++) + munmap(hptr[i], MAP_CHUNK_SIZE); + + return 0; +} diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c new file mode 100644 index 000000000000..40e795624ff3 --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "../kselftest.h" +#include "vm_util.h" + +#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SMAP_FILE_PATH "/proc/self/smaps" +#define MAX_LINE_LENGTH 500 + +uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / getpagesize(); + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +bool pagemap_is_softdirty(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + // Check if dirty bit (55th bit) is set + return entry & 0x0080000000000000ull; +} + +bool pagemap_is_swapped(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + return entry & 0x4000000000000000ull; +} + +bool pagemap_is_populated(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* Present or swapped. */ + return entry & 0xc000000000000000ull; +} + +unsigned long pagemap_get_pfn(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* If present (63th bit), PFN is at bit 0 -- 54. */ + if (entry & 0x8000000000000000ull) + return entry & 0x007fffffffffffffull; + return -1ul; +} + +void clear_softdirty(void) +{ + int ret; + const char *ctrl = "4"; + int fd = open("/proc/self/clear_refs", O_WRONLY); + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + close(fd); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); +} + +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) +{ + while (fgets(buf, len, fp)) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); + if (fd == -1) + ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); + + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) +{ + uint64_t thp = -1; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + + fp = fopen(SMAP_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); + + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + /* + * Fetch the pattern in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) + goto err_out; + + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) + ksft_exit_fail_msg("Reading smap error\n"); + +err_out: + fclose(fp); + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h new file mode 100644 index 000000000000..1995ee911ef2 --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +uint64_t pagemap_get_entry(int fd, char *start); +bool pagemap_is_softdirty(int fd, char *start); +bool pagemap_is_swapped(int fd, char *start); +bool pagemap_is_populated(int fd, char *start); +unsigned long pagemap_get_pfn(int fd, char *start); +void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); +uint64_t read_pmd_pagesize(void); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh new file mode 100644 index 000000000000..70a02301f4c2 --- /dev/null +++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +set -e + +size=$1 +populate=$2 +write=$3 +cgroup=$4 +path=$5 +method=$6 +private=$7 +want_sleep=$8 +reserve=$9 + +echo "Putting task in cgroup '$cgroup'" +echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs + +echo "Method is $method" + +set +e +./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ + "$private" "$want_sleep" "$reserve" diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c new file mode 100644 index 000000000000..6a2caba19ee1 --- /dev/null +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This program reserves and uses hugetlb memory, supporting a bunch of + * scenarios needed by the charged_reserved_hugetlb.sh test. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global definitions. */ +enum method { + HUGETLBFS, + MMAP_MAP_HUGETLB, + SHM, + MAX_METHOD +}; + + +/* Global variables. */ +static const char *self; +static char *shmaddr; +static int shmid; + +/* + * Show usage and exit. + */ +static void exit_usage(void) +{ + printf("Usage: %s -p -s " + "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " + "[-o] [-w] [-n]\n", + self); + exit(EXIT_FAILURE); +} + +void sig_handler(int signo) +{ + printf("Received %d.\n", signo); + if (signo == SIGINT) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + printf("Done deleting the memory\n"); + } + exit(2); +} + +int main(int argc, char **argv) +{ + int fd = 0; + int key = 0; + int *ptr = NULL; + int c = 0; + int size = 0; + char path[256] = ""; + enum method method = MAX_METHOD; + int want_sleep = 0, private = 0; + int populate = 0; + int write = 0; + int reserve = 1; + + if (signal(SIGINT, sig_handler) == SIG_ERR) + err(1, "\ncan't catch SIGINT\n"); + + /* Parse command-line arguments. */ + setvbuf(stdout, NULL, _IONBF, 0); + self = argv[0]; + + while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { + switch (c) { + case 's': + size = atoi(optarg); + break; + case 'p': + strncpy(path, optarg, sizeof(path)); + break; + case 'm': + if (atoi(optarg) >= MAX_METHOD) { + errno = EINVAL; + perror("Invalid -m."); + exit_usage(); + } + method = atoi(optarg); + break; + case 'o': + populate = 1; + break; + case 'w': + write = 1; + break; + case 'l': + want_sleep = 1; + break; + case 'r': + private + = 1; + break; + case 'n': + reserve = 0; + break; + default: + errno = EINVAL; + perror("Invalid arg"); + exit_usage(); + } + } + + if (strncmp(path, "", sizeof(path)) != 0) { + printf("Writing to this path: %s\n", path); + } else { + errno = EINVAL; + perror("path not found"); + exit_usage(); + } + + if (size != 0) { + printf("Writing this size: %d\n", size); + } else { + errno = EINVAL; + perror("size not found"); + exit_usage(); + } + + if (!populate) + printf("Not populating.\n"); + else + printf("Populating.\n"); + + if (!write) + printf("Not writing to memory.\n"); + + if (method == MAX_METHOD) { + errno = EINVAL; + perror("-m Invalid"); + exit_usage(); + } else + printf("Using method=%d\n", method); + + if (!private) + printf("Shared mapping.\n"); + else + printf("Private mapping.\n"); + + if (!reserve) + printf("NO_RESERVE mapping.\n"); + else + printf("RESERVE mapping.\n"); + + switch (method) { + case HUGETLBFS: + printf("Allocating using HUGETLBFS.\n"); + fd = open(path, O_CREAT | O_RDWR, 0777); + if (fd == -1) + err(1, "Failed to open file."); + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? MAP_PRIVATE : MAP_SHARED) | + (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + fd, 0); + + if (ptr == MAP_FAILED) { + close(fd); + err(1, "Error mapping the file"); + } + break; + case MMAP_MAP_HUGETLB: + printf("Allocating using MAP_HUGETLB.\n"); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : + MAP_SHARED) | + MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + -1, 0); + + if (ptr == MAP_FAILED) + err(1, "mmap"); + + printf("Returned address is %p\n", ptr); + break; + case SHM: + printf("Allocating using SHM.\n"); + shmid = shmget(key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + shmid = shmget(++key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) + err(1, "shmget"); + } + printf("shmid: 0x%x, shmget key:%d\n", shmid, key); + + ptr = shmat(shmid, NULL, 0); + if (ptr == (int *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", ptr); + + break; + default: + errno = EINVAL; + err(1, "Invalid method."); + } + + if (write) { + printf("Writing to memory.\n"); + memset(ptr, 1, size); + } + + if (want_sleep) { + /* Signal to caller that we're done. */ + printf("DONE\n"); + + /* Hold memory until external kill signal is delivered. */ + while (1) + sleep(100); + } + + if (method == HUGETLBFS) + close(fd); + + return 0; +} diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore deleted file mode 100644 index 1f8c36a9fa10..000000000000 --- a/tools/testing/selftests/vm/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -cow -hugepage-mmap -hugepage-mremap -hugepage-shm -hugepage-vmemmap -hugetlb-madvise -khugepaged -map_hugetlb -map_populate -thuge-gen -compaction_test -migration -mlock2-tests -mrelease_test -mremap_dontunmap -mremap_test -on-fault-limit -transhuge-stress -protection_keys -protection_keys_32 -protection_keys_64 -madv_populate -userfaultfd -mlock-intersect-test -mlock-random-test -virtual_address_range -gup_test -va_128TBswitch -map_fixed_noreplace -write_to_hugetlbfs -hmm-tests -memfd_secret -soft-dirty -split_huge_page_test -ksm_tests -local_config.h -local_config.mk diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile deleted file mode 100644 index 89c14e41bd43..000000000000 --- a/tools/testing/selftests/vm/Makefile +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for vm selftests - -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h - -include local_config.mk - -uname_M := $(shell uname -m 2>/dev/null || echo not) -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') - -# Without this, failed build products remain, with up-to-date timestamps, -# thus tricking Make (and you!) into believing that All Is Well, in subsequent -# make invocations: -.DELETE_ON_ERROR: - -# Avoid accidental wrong builds, due to built-in rules working just a little -# bit too well--but not quite as well as required for our situation here. -# -# In other words, "make userfaultfd" is supposed to fail to build at all, -# because this Makefile only supports either "make" (all), or "make /full/path". -# However, the built-in rules, if not suppressed, will pick up CFLAGS and the -# initial LDLIBS (but not the target-specific LDLIBS, because those are only -# set for the full path target!). This causes it to get pretty far into building -# things despite using incorrect values such as an *occasionally* incomplete -# LDLIBS. -MAKEFLAGS += --no-builtin-rules - -CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) -LDLIBS = -lrt -lpthread -TEST_GEN_FILES = cow -TEST_GEN_FILES += compaction_test -TEST_GEN_FILES += gup_test -TEST_GEN_FILES += hmm-tests -TEST_GEN_FILES += hugetlb-madvise -TEST_GEN_FILES += hugepage-mmap -TEST_GEN_FILES += hugepage-mremap -TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += hugepage-vmemmap -TEST_GEN_FILES += khugepaged -TEST_GEN_PROGS = madv_populate -TEST_GEN_FILES += map_fixed_noreplace -TEST_GEN_FILES += map_hugetlb -TEST_GEN_FILES += map_populate -TEST_GEN_FILES += memfd_secret -TEST_GEN_FILES += migration -TEST_GEN_FILES += mlock-random-test -TEST_GEN_FILES += mlock2-tests -TEST_GEN_FILES += mrelease_test -TEST_GEN_FILES += mremap_dontunmap -TEST_GEN_FILES += mremap_test -TEST_GEN_FILES += on-fault-limit -TEST_GEN_FILES += thuge-gen -TEST_GEN_FILES += transhuge-stress -TEST_GEN_FILES += userfaultfd -TEST_GEN_PROGS += soft-dirty -TEST_GEN_PROGS += split_huge_page_test -TEST_GEN_FILES += ksm_tests -TEST_GEN_PROGS += ksm_functional_tests - -ifeq ($(MACHINE),x86_64) -CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) -CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) -CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) - -VMTARGETS := protection_keys -BINARIES_32 := $(VMTARGETS:%=%_32) -BINARIES_64 := $(VMTARGETS:%=%_64) - -ifeq ($(CAN_BUILD_WITH_NOPIE),1) -CFLAGS += -no-pie -endif - -ifeq ($(CAN_BUILD_I386),1) -TEST_GEN_FILES += $(BINARIES_32) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -TEST_GEN_FILES += $(BINARIES_64) -endif -else - -ifneq (,$(findstring $(MACHINE),ppc64)) -TEST_GEN_FILES += protection_keys -endif - -endif - -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) -TEST_GEN_FILES += va_128TBswitch -TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += write_to_hugetlbfs -endif - -TEST_PROGS := run_vmtests.sh - -TEST_FILES := test_vmalloc.sh -TEST_FILES += test_hmm.sh -TEST_FILES += va_128TBswitch.sh - -include ../lib.mk - -$(OUTPUT)/cow: vm_util.c -$(OUTPUT)/khugepaged: vm_util.c -$(OUTPUT)/ksm_functional_tests: vm_util.c -$(OUTPUT)/madv_populate: vm_util.c -$(OUTPUT)/soft-dirty: vm_util.c -$(OUTPUT)/split_huge_page_test: vm_util.c -$(OUTPUT)/userfaultfd: vm_util.c - -ifeq ($(MACHINE),x86_64) -BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) -BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) - -define gen-target-rule-32 -$(1) $(1)_32: $(OUTPUT)/$(1)_32 -.PHONY: $(1) $(1)_32 -endef - -define gen-target-rule-64 -$(1) $(1)_64: $(OUTPUT)/$(1)_64 -.PHONY: $(1) $(1)_64 -endef - -ifeq ($(CAN_BUILD_I386),1) -$(BINARIES_32): CFLAGS += -m32 -mxsave -$(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): $(OUTPUT)/%_32: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -$(BINARIES_64): CFLAGS += -m64 -mxsave -$(BINARIES_64): LDLIBS += -lrt -ldl -$(BINARIES_64): $(OUTPUT)/%_64: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) -endif - -# x86_64 users should be encouraged to install 32-bit libraries -ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) -all: warn_32bit_failure - -warn_32bit_failure: - @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ - echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ - echo "kernels. If you are using a Debian-like distribution," 2>&1; \ - echo "try:"; 2>&1; \ - echo ""; \ - echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ - echo ""; \ - echo "If you are using a Fedora-like distribution, try:"; \ - echo ""; \ - echo " yum install glibc-devel.*i686"; \ - exit 0; -endif -endif - -# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) - -$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap - -$(OUTPUT)/ksm_tests: LDLIBS += -lnuma - -$(OUTPUT)/migration: LDLIBS += -lnuma - -local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) - -EXTRA_CLEAN += local_config.mk local_config.h - -ifeq ($(COW_EXTRA_LIBS),) -all: warn_missing_liburing - -warn_missing_liburing: - @echo ; \ - echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ - echo -endif diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh deleted file mode 100644 index a5cb4b09a46c..000000000000 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ /dev/null @@ -1,584 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." - exit $ksft_skip -fi - -fault_limit_file=limit_in_bytes -reservation_limit_file=rsvd.limit_in_bytes -fault_usage_file=usage_in_bytes -reservation_usage_file=rsvd.usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - fault_limit_file=max - reservation_limit_file=rsvd.max - fault_usage_file=current - reservation_usage_file=rsvd.current -fi - -if [[ $cgroup2 ]]; then - cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup2 none $cgroup_path - do_umount=1 - fi - echo "+hugetlb" >$cgroup_path/cgroup.subtree_control -else - cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $cgroup_path - do_umount=1 - fi -fi -export cgroup_path - -function cleanup() { - if [[ $cgroup2 ]]; then - echo $$ >$cgroup_path/cgroup.procs - else - echo $$ >$cgroup_path/tasks - fi - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge || echo error - rmdir /mnt/huge - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test1 - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test2 - fi - echo 0 >/proc/sys/vm/nr_hugepages - echo CLEANUP DONE -} - -function expect_equal() { - local expected="$1" - local actual="$2" - local error="$3" - - if [[ "$expected" != "$actual" ]]; then - echo "expected ($expected) != actual ($actual): $3" - cleanup - exit 1 - fi -} - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function setup_cgroup() { - local name="$1" - local cgroup_limit="$2" - local reservation_limit="$3" - - mkdir $cgroup_path/$name - - echo writing cgroup limit: "$cgroup_limit" - echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - - echo writing reseravation limit: "$reservation_limit" - echo "$reservation_limit" > \ - $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file - - if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then - echo 0 >$cgroup_path/$name/cpuset.cpus - fi - if [ -e "$cgroup_path/$name/cpuset.mems" ]; then - echo 0 >$cgroup_path/$name/cpuset.mems - fi -} - -function wait_for_hugetlb_memory_to_get_depleted() { - local cgroup="$1" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_reserved() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_written() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done -} - -function write_hugetlbfs_and_get_usage() { - local cgroup="$1" - local size="$2" - local populate="$3" - local write="$4" - local path="$5" - local method="$6" - local private="$7" - local expect_failure="$8" - local reserve="$9" - - # Function return values. - reservation_failed=0 - oom_killed=0 - hugetlb_difference=0 - reserved_difference=0 - - local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file - local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file - - local hugetlb_before=$(cat $hugetlb_usage) - local reserved_before=$(cat $reserved_usage) - - echo - echo Starting: - echo hugetlb_usage="$hugetlb_before" - echo reserved_usage="$reserved_before" - echo expect_failure is "$expect_failure" - - output=$(mktemp) - set +e - if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || - [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then - - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & - - local write_result=$? - local write_pid=$! - - until grep -q -i "DONE" $output; do - echo waiting for DONE signal. - if ! ps $write_pid > /dev/null - then - echo "FAIL: The write died" - cleanup - exit 1 - fi - sleep 0.5 - done - - echo ================= write_hugetlb_memory.sh output is: - cat $output - echo ================= end output. - - if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then - wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" - elif [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - else - # This case doesn't produce visible effects, but we still have - # to wait for the async process to start and execute... - sleep 0.5 - fi - - echo write_result is $write_result - else - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "$reserve" - local write_result=$? - - if [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - fi - fi - set -e - - if [[ "$write_result" == 1 ]]; then - reservation_failed=1 - fi - - # On linus/master, the above process gets SIGBUS'd on oomkill, with - # return code 135. On earlier kernels, it gets actual oomkill, with return - # code 137, so just check for both conditions in case we're testing - # against an earlier kernel. - if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then - oom_killed=1 - fi - - local hugetlb_after=$(cat $hugetlb_usage) - local reserved_after=$(cat $reserved_usage) - - echo After write: - echo hugetlb_usage="$hugetlb_after" - echo reserved_usage="$reserved_after" - - hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) - reserved_difference=$(($reserved_after - $reserved_before)) -} - -function cleanup_hugetlb_memory() { - set +e - local cgroup="$1" - if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then - echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs - wait_for_hugetlb_memory_to_get_depleted $cgroup - fi - set -e - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge - rmdir /mnt/huge - fi -} - -function run_test() { - local size=$(($1 * ${MB} * 1024 * 1024)) - local populate="$2" - local write="$3" - local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) - local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) - local nr_hugepages="$6" - local method="$7" - local private="$8" - local expect_failure="$9" - local reserve="${10}" - - # Function return values. - hugetlb_difference=0 - reserved_difference=0 - reservation_failed=0 - oom_killed=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ - "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ - "$reserve" - - cleanup_hugetlb_memory "hugetlb_cgroup_test" - - local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) - local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) - - echo $hugetlb_difference - echo $reserved_difference - expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" - expect_equal "0" "$final_reservation" "final reservation is not zero" -} - -function run_multiple_cgroup_test() { - local size1="$1" - local populate1="$2" - local write1="$3" - local cgroup_limit1="$4" - local reservation_limit1="$5" - - local size2="$6" - local populate2="$7" - local write2="$8" - local cgroup_limit2="$9" - local reservation_limit2="${10}" - - local nr_hugepages="${11}" - local method="${12}" - local private="${13}" - local expect_failure="${14}" - local reserve="${15}" - - # Function return values. - hugetlb_difference1=0 - reserved_difference1=0 - reservation_failed1=0 - oom_killed1=0 - - hugetlb_difference2=0 - reserved_difference2=0 - reservation_failed2=0 - oom_killed2=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" - setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ - "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference1=$hugetlb_difference - reserved_difference1=$reserved_difference - reservation_failed1=$reservation_failed - oom_killed1=$oom_killed - - local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file - local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file - local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file - local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file - - local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) - local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ - "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference2=$hugetlb_difference - reserved_difference2=$reserved_difference - reservation_failed2=$reservation_failed - oom_killed2=$oom_killed - - expect_equal "$usage_before_second_write" \ - "$(cat $cgroup1_hugetlb_usage)" "Usage changed." - expect_equal "$reservation_usage_before_second_write" \ - "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." - - cleanup_hugetlb_memory - - local final_hugetlb=$(cat $cgroup1_hugetlb_usage) - local final_reservation=$(cat $cgroup1_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlbt_cgroup_test1 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlbt_cgroup_test1 final reservation is not zero" - - local final_hugetlb=$(cat $cgroup2_hugetlb_usage) - local final_reservation=$(cat $cgroup2_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlb_cgroup_test2 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlb_cgroup_test2 final reservation is not zero" -} - -cleanup - -for populate in "" "-o"; do - for method in 0 1 2; do - for private in "" "-r"; do - for reserve in "" "-n"; do - - # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. - if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then - continue - fi - - # Skip populated shmem tests. Doesn't seem to be supported. - if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then - continue - fi - - if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then - continue - fi - - cleanup - echo - echo - echo - echo Test normal case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - if [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - else - expect_equal "0" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - fi - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - else - expect_equal "0" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - fi - - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - - echo 'PASS' - - cleanup - continue - echo - echo - echo - echo Test more than reservation case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - if [ "$reserve" != "-n" ]; then - run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ - "$reserve" - - expect_equal "1" "$reservation_failed" "Reservation succeeded." - fi - - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test more than cgroup limit case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - # Not sure if shm memory can be cleaned up when the process gets sigbus'd. - if [[ "$method" != 2 ]]; then - run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" - - expect_equal "1" "$oom_killed" "Not oom killed." - fi - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test normal case, multiple cgroups. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ - "$populate" "" "10" "10" "10" \ - "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb1=$hugetlb_difference1 - echo Memory charged to reservation1=$reserved_difference1 - echo Memory charged to hugtlb2=$hugetlb_difference2 - echo Memory charged to reservation2=$reserved_difference2 - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "3" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "5" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - - else - expect_equal "0" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "0" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - fi - - if [[ "$populate" == "-o" ]]; then - expect_equal "3" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "5" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - - else - expect_equal "0" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "0" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - fi - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write, multiple cgroups. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ - "$populate" "-w" "10" "10" "10" \ - "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb1=$hugetlb_difference1 - echo Memory charged to reservation1=$reserved_difference1 - echo Memory charged to hugtlb2=$hugetlb_difference2 - echo Memory charged to reservation2=$reserved_difference2 - - expect_equal "3" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "3" "$reserved_difference1" \ - "Incorrect reservation charged to cgroup 1." - - expect_equal "5" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - - expect_equal "5" "$reserved_difference2" \ - "Incorrected reservation charged to cgroup 2." - echo 'PASS' - - cleanup - - done # reserve - done # private - done # populate -done # method - -if [[ $do_umount ]]; then - umount $cgroup_path - rmdir $cgroup_path -fi diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh deleted file mode 100644 index bcba3af0acea..000000000000 --- a/tools/testing/selftests/vm/check_config.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Probe for libraries and create header files to record the results. Both C -# header files and Makefile include fragments are created. - -OUTPUT_H_FILE=local_config.h -OUTPUT_MKFILE=local_config.mk - -tmpname=$(mktemp) -tmpfile_c=${tmpname}.c -tmpfile_o=${tmpname}.o - -# liburing -echo "#include " > $tmpfile_c -echo "#include " >> $tmpfile_c -echo "int func(void) { return 0; }" >> $tmpfile_c - -CC=${1:?"Usage: $0 # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 - -if [ -f $tmpfile_o ]; then - echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE - echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE -else - echo "// No liburing support found" > $OUTPUT_H_FILE - echo "# No liburing support found, so:" > $OUTPUT_MKFILE - echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE -fi - -rm ${tmpname}.* diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c deleted file mode 100644 index 9b420140ba2b..000000000000 --- a/tools/testing/selftests/vm/compaction_test.c +++ /dev/null @@ -1,231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * - * A test for the patch "Allow compaction of unevictable pages". - * With this patch we should be able to allocate at least 1/4 - * of RAM in huge pages. Without the patch much less is - * allocated. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define MAP_SIZE_MB 100 -#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) - -struct map_list { - void *map; - struct map_list *next; -}; - -int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) -{ - char buffer[256] = {0}; - char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; - FILE *cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - - *memfree = atoll(buffer); - cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; - cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - *hugepagesize = atoll(buffer); - - return 0; -} - -int prereq(void) -{ - char allowed; - int fd; - - fd = open("/proc/sys/vm/compact_unevictable_allowed", - O_RDONLY | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - return -1; - } - - if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { - perror("Failed to read from\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - close(fd); - return -1; - } - - close(fd); - if (allowed == '1') - return 0; - - return -1; -} - -int check_compaction(unsigned long mem_free, unsigned int hugepage_size) -{ - int fd; - int compaction_index = 0; - char initial_nr_hugepages[10] = {0}; - char nr_hugepages[10] = {0}; - - /* We want to test with 80% of available memory. Else, OOM killer comes - in to play */ - mem_free = mem_free * 0.8; - - fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open /proc/sys/vm/nr_hugepages"); - return -1; - } - - if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - perror("Failed to read from /proc/sys/vm/nr_hugepages"); - goto close_fd; - } - - /* Start with the initial condition of 0 huge pages*/ - if (write(fd, "0", sizeof(char)) != sizeof(char)) { - perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - /* Request a large number of huge pages. The Kernel will allocate - as much as it can */ - if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - /* We should have been able to request at least 1/3 rd of the memory in - huge pages */ - compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); - - if (compaction_index > 3) { - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" - "as huge pages\n", compaction_index); - goto close_fd; - } - - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - - lseek(fd, 0, SEEK_SET); - - if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) - != strlen(initial_nr_hugepages)) { - perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - close(fd); - return 0; - - close_fd: - close(fd); - printf("Not OK. Compaction test failed."); - return -1; -} - - -int main(int argc, char **argv) -{ - struct rlimit lim; - struct map_list *list, *entry; - size_t page_size, i; - void *map = NULL; - unsigned long mem_free = 0; - unsigned long hugepage_size = 0; - long mem_fragmentable_MB = 0; - - if (prereq() != 0) { - printf("Either the sysctl compact_unevictable_allowed is not\n" - "set to 1 or couldn't read the proc file.\n" - "Skipping the test\n"); - return KSFT_SKIP; - } - - lim.rlim_cur = RLIM_INFINITY; - lim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &lim)) { - perror("Failed to set rlimit:\n"); - return -1; - } - - page_size = getpagesize(); - - list = NULL; - - if (read_memory_info(&mem_free, &hugepage_size) != 0) { - printf("ERROR: Cannot read meminfo\n"); - return -1; - } - - mem_fragmentable_MB = mem_free * 0.8 / 1024; - - while (mem_fragmentable_MB > 0) { - map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); - if (map == MAP_FAILED) - break; - - entry = malloc(sizeof(struct map_list)); - if (!entry) { - munmap(map, MAP_SIZE); - break; - } - entry->map = map; - entry->next = list; - list = entry; - - /* Write something (in this case the address of the map) to - * ensure that KSM can't merge the mapped pages - */ - for (i = 0; i < MAP_SIZE; i += page_size) - *(unsigned long *)(map + i) = (unsigned long)map + i; - - mem_fragmentable_MB -= MAP_SIZE_MB; - } - - for (entry = list; entry != NULL; entry = entry->next) { - munmap(entry->map, MAP_SIZE); - if (!entry->next) - break; - entry = entry->next; - } - - if (check_compaction(mem_free, hugepage_size) == 0) - return 0; - - return -1; -} diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config deleted file mode 100644 index be087c4bc396..000000000000 --- a/tools/testing/selftests/vm/config +++ /dev/null @@ -1,8 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_USERFAULTFD=y -CONFIG_TEST_VMALLOC=m -CONFIG_DEVICE_PRIVATE=y -CONFIG_TEST_HMM=m -CONFIG_GUP_TEST=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c deleted file mode 100644 index 16216d893d96..000000000000 --- a/tools/testing/selftests/vm/cow.c +++ /dev/null @@ -1,1764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * COW (Copy On Write) tests. - * - * Copyright 2022, Red Hat, Inc. - * - * Author(s): David Hildenbrand - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "local_config.h" -#ifdef LOCAL_CONFIG_HAVE_LIBURING -#include -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -#include "../../../../mm/gup_test.h" -#include "../kselftest.h" -#include "vm_util.h" - -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -static size_t pagesize; -static int pagemap_fd; -static size_t thpsize; -static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; -static int gup_fd; -static bool has_huge_zeropage; - -static void detect_thpsize(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", - O_RDONLY); - size_t size = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - size = strtoul(buf, NULL, 10); - if (size < pagesize) - size = 0; - if (size > 0) { - thpsize = size; - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); - } - } - - close(fd); -} - -static void detect_huge_zeropage(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", - O_RDONLY); - size_t enabled = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - enabled = strtoul(buf, NULL, 10); - if (enabled == 1) { - has_huge_zeropage = true; - ksft_print_msg("[INFO] huge zeropage is enabled\n"); - } - } - - close(fd); -} - -static void detect_hugetlbsizes(void) -{ - DIR *dir = opendir("/sys/kernel/mm/hugepages/"); - - if (!dir) - return; - - while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { - struct dirent *entry = readdir(dir); - size_t kb; - - if (!entry) - break; - if (entry->d_type != DT_DIR) - continue; - if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) - continue; - hugetlbsizes[nr_hugetlbsizes] = kb * 1024; - nr_hugetlbsizes++; - ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", - kb); - } - closedir(dir); -} - -static bool range_is_swapped(void *addr, size_t size) -{ - for (; size; addr += pagesize, size -= pagesize) - if (!pagemap_is_swapped(pagemap_fd, addr)) - return false; - return true; -} - -struct comm_pipes { - int child_ready[2]; - int parent_ready[2]; -}; - -static int setup_comm_pipes(struct comm_pipes *comm_pipes) -{ - if (pipe(comm_pipes->child_ready) < 0) - return -errno; - if (pipe(comm_pipes->parent_ready) < 0) { - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - return -errno; - } - - return 0; -} - -static void close_comm_pipes(struct comm_pipes *comm_pipes) -{ - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - close(comm_pipes->parent_ready[0]); - close(comm_pipes->parent_ready[1]); -} - -static int child_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - char *old = malloc(size); - char buf; - - /* Backup the original content. */ - memcpy(old, mem, size); - - /* Wait until the parent modified the page. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values. */ - return memcmp(old, mem, size); -} - -static int child_vmsplice_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - char *old, *new; - int fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - /* Backup the original content. */ - memcpy(old, mem, size); - - if (pipe(fds) < 0) - return -errno; - - /* Trigger a read-only pin. */ - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred < 0) - return -errno; - if (transferred == 0) - return -EINVAL; - - /* Unmap it from our page tables. */ - if (munmap(mem, size) < 0) - return -errno; - - /* Wait until the parent modified it. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values via the pipe. */ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) - return -errno; - } - - return memcmp(old, new, transferred); -} - -typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); - -static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - exit(fn(mem, size, &comm_pipes)); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (do_mprotect) { - /* - * mprotect() optimizations might try avoiding - * write-faults by directly mapping pages writable. - */ - ret = mprotect(mem, size, PROT_READ); - ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - } - - /* Modify the page. */ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_cow_in_parent(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); -} - -static void test_cow_in_parent_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); -} - -static void test_vmsplice_in_child(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); -} - -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); -} - -static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - struct comm_pipes comm_pipes; - char *old, *new; - int ret, fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - memcpy(old, mem, size); - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free; - } - - if (pipe(fds) < 0) { - ksft_test_result_fail("pipe() failed\n"); - goto close_comm_pipes; - } - - if (before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - goto close_pipe; - } - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_pipe; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - /* Modify page content in the child. */ - memset(mem, 0xff, size); - exit(0); - } - - if (!before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - wait(&ret); - goto close_pipe; - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - if (munmap(mem, size) < 0) { - ksft_test_result_fail("munmap() failed\n"); - goto close_pipe; - } - write(comm_pipes.parent_ready[1], "0", 1); - - /* Wait until the child is done writing. */ - wait(&ret); - if (!WIFEXITED(ret)) { - ksft_test_result_fail("wait() failed\n"); - goto close_pipe; - } - - /* See if we still read the old values. */ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) { - ksft_test_result_fail("read() failed\n"); - goto close_pipe; - } - } - - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); -close_pipe: - close(fds[0]); - close(fds[1]); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free: - free(old); - free(new); -} - -static void test_vmsplice_before_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, true); -} - -static void test_vmsplice_after_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, false); -} - -#ifdef LOCAL_CONFIG_HAVE_LIBURING -static void do_test_iouring(char *mem, size_t size, bool use_fork) -{ - struct comm_pipes comm_pipes; - struct io_uring_cqe *cqe; - struct io_uring_sqe *sqe; - struct io_uring ring; - ssize_t cur, total; - struct iovec iov; - char *buf, *tmp; - int ret, fd; - FILE *file; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - file = tmpfile(); - if (!file) { - ksft_test_result_fail("tmpfile() failed\n"); - goto close_comm_pipes; - } - fd = fileno(file); - assert(fd); - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - goto close_file; - } - - /* Skip on errors, as we might just lack kernel support. */ - ret = io_uring_queue_init(1, &ring, 0); - if (ret < 0) { - ksft_test_result_skip("io_uring_queue_init() failed\n"); - goto free_tmp; - } - - /* - * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN - * | FOLL_LONGTERM the range. - * - * Skip on errors, as we might just lack kernel support or might not - * have sufficient MEMLOCK permissions. - */ - iov.iov_base = mem; - iov.iov_len = size; - ret = io_uring_register_buffers(&ring, &iov, 1); - if (ret) { - ksft_test_result_skip("io_uring_register_buffers() failed\n"); - goto queue_exit; - } - - if (use_fork) { - /* - * fork() and keep the child alive until we're done. Note that - * we expect the pinned page to not get shared with the child. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto unregister_buffers; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - } else { - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). - */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto unregister_buffers; - } - } - - /* - * Modify the page and write page content as observed by the fixed - * buffer pin to the file so we can verify it. - */ - memset(mem, 0xff, size); - sqe = io_uring_get_sqe(&ring); - if (!sqe) { - ksft_test_result_fail("io_uring_get_sqe() failed\n"); - goto quit_child; - } - io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); - - ret = io_uring_submit(&ring); - if (ret < 0) { - ksft_test_result_fail("io_uring_submit() failed\n"); - goto quit_child; - } - - ret = io_uring_wait_cqe(&ring, &cqe); - if (ret < 0) { - ksft_test_result_fail("io_uring_wait_cqe() failed\n"); - goto quit_child; - } - - if (cqe->res != size) { - ksft_test_result_fail("write_fixed failed\n"); - goto quit_child; - } - io_uring_cqe_seen(&ring, cqe); - - /* Read back the file content to the temporary buffer. */ - total = 0; - while (total < size) { - cur = pread(fd, tmp + total, size - total, total); - if (cur < 0) { - ksft_test_result_fail("pread() failed\n"); - goto quit_child; - } - total += cur; - } - - /* Finally, check if we read what we expected. */ - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/W pin is reliable\n"); - -quit_child: - if (use_fork) { - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - } -unregister_buffers: - io_uring_unregister_buffers(&ring); -queue_exit: - io_uring_queue_exit(&ring); -free_tmp: - free(tmp); -close_file: - fclose(file); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_iouring_ro(char *mem, size_t size) -{ - do_test_iouring(mem, size, false); -} - -static void test_iouring_fork(char *mem, size_t size) -{ - do_test_iouring(mem, size, true); -} - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -enum ro_pin_test { - RO_PIN_TEST, - RO_PIN_TEST_SHARED, - RO_PIN_TEST_PREVIOUSLY_SHARED, - RO_PIN_TEST_RO_EXCLUSIVE, -}; - -static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, - bool fast) -{ - struct pin_longterm_test args; - struct comm_pipes comm_pipes; - char *tmp, buf; - __u64 tmp_val; - int ret; - - if (gup_fd < 0) { - ksft_test_result_skip("gup_test not available\n"); - return; - } - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - return; - } - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free_tmp; - } - - switch (test) { - case RO_PIN_TEST: - break; - case RO_PIN_TEST_SHARED: - case RO_PIN_TEST_PREVIOUSLY_SHARED: - /* - * Share the pages with our child. As the pages are not pinned, - * this should just work. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - /* Wait until our child is ready. */ - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { - /* - * Tell the child to quit now and wait until it quit. - * The pages should now be mapped R/O into our page - * tables, but they are no longer shared. - */ - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - } - break; - case RO_PIN_TEST_RO_EXCLUSIVE: - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). - */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Take a R/O pin. This should trigger unsharing. */ - args.addr = (__u64)(uintptr_t)mem; - args.size = size; - args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); - if (ret) { - if (errno == EINVAL) - ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); - else - ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); - goto wait; - } - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* - * Read back the content via the pin to the temporary buffer and - * test if we observed the modification. - */ - tmp_val = (__u64)(uintptr_t)tmp; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); - if (ret) - ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); - else - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/O pin is reliable\n"); - - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); - if (ret) - ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); -wait: - switch (test) { - case RO_PIN_TEST_SHARED: - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - break; - default: - break; - } -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free_tmp: - free(tmp); -} - -static void test_ro_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); -} - -static void test_ro_fast_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); -} - -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); -} - -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); -} - -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); -} - -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); -} - -typedef void (*test_fn)(char *mem, size_t size); - -static void do_run_with_base_page(test_fn fn, bool swapout) -{ - char *mem; - int ret; - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); - /* Ignore if not around on a kernel. */ - if (ret && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); - goto munmap; - } - - /* Populate a base page. */ - memset(mem, 0, pagesize); - - if (swapout) { - madvise(mem, pagesize, MADV_PAGEOUT); - if (!pagemap_is_swapped(pagemap_fd, mem)) { - ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); - goto munmap; - } - } - - fn(mem, pagesize); -munmap: - munmap(mem, pagesize); -} - -static void run_with_base_page(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with base page\n", desc); - do_run_with_base_page(fn, false); -} - -static void run_with_base_page_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); - do_run_with_base_page(fn, true); -} - -enum thp_run { - THP_RUN_PMD, - THP_RUN_PMD_SWAPOUT, - THP_RUN_PTE, - THP_RUN_PTE_SWAPOUT, - THP_RUN_SINGLE_PTE, - THP_RUN_SINGLE_PTE_SWAPOUT, - THP_RUN_PARTIAL_MREMAP, - THP_RUN_PARTIAL_SHARED, -}; - -static void do_run_with_thp(test_fn fn, enum thp_run thp_run) -{ - char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; - size_t size, mmap_size, mremap_size; - int ret; - - /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; - mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - if (ret) { - ksft_test_result_fail("MADV_HUGEPAGE failed\n"); - goto munmap; - } - - /* - * Try to populate a THP. Touch the first sub-page and test if we get - * another sub-page populated automatically. - */ - mem[0] = 0; - if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { - ksft_test_result_skip("Did not get a THP populated\n"); - goto munmap; - } - memset(mem, 0, thpsize); - - size = thpsize; - switch (thp_run) { - case THP_RUN_PMD: - case THP_RUN_PMD_SWAPOUT: - break; - case THP_RUN_PTE: - case THP_RUN_PTE_SWAPOUT: - /* - * Trigger PTE-mapping the THP by temporarily mapping a single - * subpage R/O. - */ - ret = mprotect(mem + pagesize, pagesize, PROT_READ); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto munmap; - } - ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto munmap; - } - break; - case THP_RUN_SINGLE_PTE: - case THP_RUN_SINGLE_PTE_SWAPOUT: - /* - * Discard all but a single subpage of that PTE-mapped THP. What - * remains is a single PTE mapping a single subpage. - */ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); - if (ret) { - ksft_test_result_fail("MADV_DONTNEED failed\n"); - goto munmap; - } - size = pagesize; - break; - case THP_RUN_PARTIAL_MREMAP: - /* - * Remap half of the THP. We need some new memory location - * for that. - */ - mremap_size = thpsize / 2; - mremap_mem = mmap(NULL, mremap_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - tmp = mremap(mem + mremap_size, mremap_size, mremap_size, - MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); - if (tmp != mremap_mem) { - ksft_test_result_fail("mremap() failed\n"); - goto munmap; - } - size = mremap_size; - break; - case THP_RUN_PARTIAL_SHARED: - /* - * Share the first page of the THP with a child and quit the - * child. This will result in some parts of the THP never - * have been shared. - */ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto munmap; - } - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto munmap; - } else if (!ret) { - exit(0); - } - wait(&ret); - /* Allow for sharing all pages again. */ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); - if (ret) { - ksft_test_result_fail("MADV_DOFORK failed\n"); - goto munmap; - } - break; - default: - assert(false); - } - - switch (thp_run) { - case THP_RUN_PMD_SWAPOUT: - case THP_RUN_PTE_SWAPOUT: - case THP_RUN_SINGLE_PTE_SWAPOUT: - madvise(mem, size, MADV_PAGEOUT); - if (!range_is_swapped(mem, size)) { - ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); - goto munmap; - } - break; - default: - break; - } - - fn(mem, size); -munmap: - munmap(mmap_mem, mmap_size); - if (mremap_mem != MAP_FAILED) - munmap(mremap_mem, mremap_size); -} - -static void run_with_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD); -} - -static void run_with_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); -} - -static void run_with_pte_mapped_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE); -} - -static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); -} - -static void run_with_single_pte_of_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE); -} - -static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); -} - -static void run_with_partial_mremap_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); -} - -static void run_with_partial_shared_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); -} - -static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) -{ - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; - char *mem, *dummy; - - ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, - hugetlbsize / 1024); - - flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; - - mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_skip("need more free huge pages\n"); - return; - } - - /* Populate an huge page. */ - memset(mem, 0, hugetlbsize); - - /* - * We need a total of two hugetlb pages to handle COW/unsharing - * properly, otherwise we might get zapped by a SIGBUS. - */ - dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); - if (dummy == MAP_FAILED) { - ksft_test_result_skip("need more free huge pages\n"); - goto munmap; - } - munmap(dummy, hugetlbsize); - - fn(mem, hugetlbsize); -munmap: - munmap(mem, hugetlbsize); -} - -struct test_case { - const char *desc; - test_fn fn; -}; - -/* - * Test cases that are specific to anonymous pages: pages in private mappings - * that may get shared via COW during fork(). - */ -static const struct test_case anon_test_cases[] = { - /* - * Basic COW tests for fork() without any GUP. If we miss to break COW, - * either the child can observe modifications by the parent or the - * other way around. - */ - { - "Basic COW after fork()", - test_cow_in_parent, - }, - /* - * Basic test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "Basic COW after fork() with mprotect() optimization", - test_cow_in_parent_mprotect, - }, - /* - * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If - * we miss to break COW, the child observes modifications by the parent. - * This is CVE-2020-29374 reported by Jann Horn. - */ - { - "vmsplice() + unmap in child", - test_vmsplice_in_child - }, - /* - * vmsplice() test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect - }, - /* - * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after - * fork(); modify in the child. If we miss to break COW, the parent - * observes modifications by the child. - */ - { - "vmsplice() before fork(), unmap in parent after fork()", - test_vmsplice_before_fork, - }, - /* - * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the - * child. If we miss to break COW, the parent observes modifications by - * the child. - */ - { - "vmsplice() + unmap in parent after fork()", - test_vmsplice_after_fork, - }, -#ifdef LOCAL_CONFIG_HAVE_LIBURING - /* - * Take a R/W longterm pin and then map the page R/O into the page - * table to trigger a write fault on next access. When modifying the - * page, the page content must be visible via the pin. - */ - { - "R/O-mapping a page registered as iouring fixed buffer", - test_iouring_ro, - }, - /* - * Take a R/W longterm pin and then fork() a child. When modifying the - * page, the page content must be visible via the pin. We expect the - * pinned page to not get shared with the child. - */ - { - "fork() with an iouring fixed buffer", - test_iouring_fork, - }, - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - /* - * Take a R/O longterm pin on a R/O-mapped shared anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped shared page", - test_ro_pin_on_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped shared page", - test_ro_fast_pin_on_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that - * was previously shared. When modifying the page via the page table, - * the page content change must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped previously-shared page", - test_ro_pin_on_ro_previously_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped previously-shared page", - test_ro_fast_pin_on_ro_previously_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped exclusive page", - test_ro_pin_on_ro_exclusive, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped exclusive page", - test_ro_fast_pin_on_ro_exclusive, - }, -}; - -static void run_anon_test_case(struct test_case const *test_case) -{ - int i; - - run_with_base_page(test_case->fn, test_case->desc); - run_with_base_page_swap(test_case->fn, test_case->desc); - if (thpsize) { - run_with_thp(test_case->fn, test_case->desc); - run_with_thp_swap(test_case->fn, test_case->desc); - run_with_pte_mapped_thp(test_case->fn, test_case->desc); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); - run_with_single_pte_of_thp(test_case->fn, test_case->desc); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); - run_with_partial_mremap_thp(test_case->fn, test_case->desc); - run_with_partial_shared_thp(test_case->fn, test_case->desc); - } - for (i = 0; i < nr_hugetlbsizes; i++) - run_with_hugetlb(test_case->fn, test_case->desc, - hugetlbsizes[i]); -} - -static void run_anon_test_cases(void) -{ - int i; - - ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); - - for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) - run_anon_test_case(&anon_test_cases[i]); -} - -static int tests_per_anon_test_case(void) -{ - int tests = 2 + nr_hugetlbsizes; - - if (thpsize) - tests += 8; - return tests; -} - -enum anon_thp_collapse_test { - ANON_THP_COLLAPSE_UNSHARED, - ANON_THP_COLLAPSE_FULLY_SHARED, - ANON_THP_COLLAPSE_LOWER_SHARED, - ANON_THP_COLLAPSE_UPPER_SHARED, -}; - -static void do_test_anon_thp_collapse(char *mem, size_t size, - enum anon_thp_collapse_test test) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - /* - * Trigger PTE-mapping the THP by temporarily mapping a single subpage - * R/O, such that we can try collapsing it later. - */ - ret = mprotect(mem + pagesize, pagesize, PROT_READ); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - /* Collapse before actually COW-sharing the page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* COW-share the full PTE-mapped THP. */ - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* Don't COW-share the upper part of the THP. */ - ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - /* Don't COW-share the lower part of the THP. */ - ret = madvise(mem, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - case ANON_THP_COLLAPSE_FULLY_SHARED: - exit(child_memcmp_fn(mem, size, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - exit(child_memcmp_fn(mem + size / 2, size / 2, - &comm_pipes)); - break; - default: - assert(false); - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* - * Revert MADV_DONTFORK such that we merge the VMAs and are - * able to actually collapse. - */ - ret = madvise(mem, size, MADV_DOFORK); - if (ret) { - ksft_test_result_fail("MADV_DOFORK failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - /* FALLTHROUGH */ - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* Collapse before anyone modified the COW-shared page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Modify the page. */ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_anon_thp_collapse_unshared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); -} - -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); -} - -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); -} - -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); -} - -/* - * Test cases that are specific to anonymous THP: pages in private mappings - * that may get shared via COW during fork(). - */ -static const struct test_case anon_thp_test_cases[] = { - /* - * Basic COW test for fork() without any GUP when collapsing a THP - * before fork(). - * - * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place - * collapse") might easily get COW handling wrong when not collapsing - * exclusivity information properly. - */ - { - "Basic COW after fork() when collapsing before fork()", - test_anon_thp_collapse_unshared, - }, - /* Basic COW test, but collapse after COW-sharing a full THP. */ - { - "Basic COW after fork() when collapsing after fork() (fully shared)", - test_anon_thp_collapse_fully_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the lower half of a - * THP. - */ - { - "Basic COW after fork() when collapsing after fork() (lower shared)", - test_anon_thp_collapse_lower_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the upper half of a - * THP. - */ - { - "Basic COW after fork() when collapsing after fork() (upper shared)", - test_anon_thp_collapse_upper_shared, - }, -}; - -static void run_anon_thp_test_cases(void) -{ - int i; - - if (!thpsize) - return; - - ksft_print_msg("[INFO] Anonymous THP tests\n"); - - for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { - struct test_case const *test_case = &anon_thp_test_cases[i]; - - ksft_print_msg("[RUN] %s\n", test_case->desc); - do_run_with_thp(test_case->fn, THP_RUN_PMD); - } -} - -static int tests_per_anon_thp_test_case(void) -{ - return thpsize ? 1 : 0; -} - -typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); - -static void test_cow(char *mem, const char *smem, size_t size) -{ - char *old = malloc(size); - - /* Backup the original content. */ - memcpy(old, smem, size); - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* See if we still read the old values via the other mapping. */ - ksft_test_result(!memcmp(smem, old, size), - "Other mapping not modified\n"); - free(old); -} - -static void test_ro_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, false); -} - -static void test_ro_fast_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, true); -} - -static void run_with_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - - ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Read from the page to populate the shared zeropage. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -} - -static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, *mmap_mem, *mmap_smem, tmp; - size_t mmap_size; - int ret; - - ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); - - if (!has_huge_zeropage) { - ksft_test_result_skip("Huge zeropage not enabled\n"); - return; - } - - /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; - mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - mmap_smem = mmap(NULL, mmap_size, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_smem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); - - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - ret |= madvise(smem, thpsize, MADV_HUGEPAGE); - if (ret) { - ksft_test_result_fail("MADV_HUGEPAGE failed\n"); - goto munmap; - } - - /* - * Read from the memory to populate the huge shared zeropage. Read from - * the first sub-page and test if we get another sub-page populated - * automatically. - */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || - !pagemap_is_populated(pagemap_fd, smem + pagesize)) { - ksft_test_result_skip("Did not get THPs populated\n"); - goto munmap; - } - - fn(mem, smem, thpsize); -munmap: - munmap(mmap_mem, mmap_size); - if (mmap_smem != MAP_FAILED) - munmap(mmap_smem, mmap_size); -} - -static void run_with_memfd(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - int fd; - - ksft_print_msg("[RUN] %s ... with memfd\n", desc); - - fd = memfd_create("test", 0); - if (fd < 0) { - ksft_test_result_fail("memfd_create() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, pagesize)) { - ksft_test_result_fail("fallocate() failed\n"); - goto close; - } - - /* Create a private mapping of the memfd. */ - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto close; - } - smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Fault the page in. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -close: - close(fd); -} - -static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - FILE *file; - int fd; - - ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); - - file = tmpfile(); - if (!file) { - ksft_test_result_fail("tmpfile() failed\n"); - return; - } - - fd = fileno(file); - if (fd < 0) { - ksft_test_result_skip("fileno() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, pagesize)) { - ksft_test_result_fail("fallocate() failed\n"); - goto close; - } - - /* Create a private mapping of the memfd. */ - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto close; - } - smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Fault the page in. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -close: - fclose(file); -} - -static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, - size_t hugetlbsize) -{ - int flags = MFD_HUGETLB; - char *mem, *smem, tmp; - int fd; - - ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, - hugetlbsize / 1024); - - flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; - - fd = memfd_create("test", flags); - if (fd < 0) { - ksft_test_result_skip("memfd_create() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, hugetlbsize)) { - ksft_test_result_skip("need more free huge pages\n"); - goto close; - } - - /* Create a private mapping of the memfd. */ - mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, - 0); - if (mem == MAP_FAILED) { - ksft_test_result_skip("need more free huge pages\n"); - goto close; - } - smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Fault the page in. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, hugetlbsize); -munmap: - munmap(mem, hugetlbsize); - if (mem != MAP_FAILED) - munmap(smem, hugetlbsize); -close: - close(fd); -} - -struct non_anon_test_case { - const char *desc; - non_anon_test_fn fn; -}; - -/* - * Test cases that target any pages in private mappings that are not anonymous: - * pages that may get shared via COW ndependent of fork(). This includes - * the shared zeropage(s), pagecache pages, ... - */ -static const struct non_anon_test_case non_anon_test_cases[] = { - /* - * Basic COW test without any GUP. If we miss to break COW, changes are - * visible via other private/shared mappings. - */ - { - "Basic COW", - test_cow, - }, - /* - * Take a R/O longterm pin. When modifying the page via the page table, - * the page content change must be visible via the pin. - */ - { - "R/O longterm GUP pin", - test_ro_pin, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O longterm GUP-fast pin", - test_ro_fast_pin, - }, -}; - -static void run_non_anon_test_case(struct non_anon_test_case const *test_case) -{ - int i; - - run_with_zeropage(test_case->fn, test_case->desc); - run_with_memfd(test_case->fn, test_case->desc); - run_with_tmpfile(test_case->fn, test_case->desc); - if (thpsize) - run_with_huge_zeropage(test_case->fn, test_case->desc); - for (i = 0; i < nr_hugetlbsizes; i++) - run_with_memfd_hugetlb(test_case->fn, test_case->desc, - hugetlbsizes[i]); -} - -static void run_non_anon_test_cases(void) -{ - int i; - - ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); - - for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) - run_non_anon_test_case(&non_anon_test_cases[i]); -} - -static int tests_per_non_anon_test_case(void) -{ - int tests = 3 + nr_hugetlbsizes; - - if (thpsize) - tests += 1; - return tests; -} - -int main(int argc, char **argv) -{ - int err; - - pagesize = getpagesize(); - detect_thpsize(); - detect_hugetlbsizes(); - detect_huge_zeropage(); - - ksft_print_header(); - ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + - ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + - ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); - - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - - run_anon_test_cases(); - run_anon_thp_test_cases(); - run_non_anon_test_cases(); - - err = ksft_get_fail_cnt(); - if (err) - ksft_exit_fail_msg("%d out of %d tests failed\n", - err, ksft_test_num()); - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c deleted file mode 100644 index e43879291dac..000000000000 --- a/tools/testing/selftests/vm/gup_test.c +++ /dev/null @@ -1,271 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" - -#include "util.h" - -#define MB (1UL << 20) - -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ - -#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" - -static unsigned long cmd = GUP_FAST_BENCHMARK; -static int gup_fd, repeats = 1; -static unsigned long size = 128 * MB; -/* Serialize prints */ -static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; - -static char *cmd_to_str(unsigned long cmd) -{ - switch (cmd) { - case GUP_FAST_BENCHMARK: - return "GUP_FAST_BENCHMARK"; - case PIN_FAST_BENCHMARK: - return "PIN_FAST_BENCHMARK"; - case PIN_LONGTERM_BENCHMARK: - return "PIN_LONGTERM_BENCHMARK"; - case GUP_BASIC_TEST: - return "GUP_BASIC_TEST"; - case PIN_BASIC_TEST: - return "PIN_BASIC_TEST"; - case DUMP_USER_PAGES_TEST: - return "DUMP_USER_PAGES_TEST"; - } - return "Unknown command"; -} - -void *gup_thread(void *data) -{ - struct gup_test gup = *(struct gup_test *)data; - int i; - - /* Only report timing information on the *_BENCHMARK commands: */ - if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || - (cmd == PIN_LONGTERM_BENCHMARK)) { - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) - perror("ioctl"), exit(1); - - pthread_mutex_lock(&print_mutex); - printf("%s: Time: get:%lld put:%lld us", - cmd_to_str(cmd), gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - pthread_mutex_unlock(&print_mutex); - } - } else { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - pthread_mutex_lock(&print_mutex); - printf("%s: done\n", cmd_to_str(cmd)); - if (gup.size != size) - printf("Truncated (size: %lld)\n", gup.size); - pthread_mutex_unlock(&print_mutex); - } - - return NULL; -} - -int main(int argc, char **argv) -{ - struct gup_test gup = { 0 }; - int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; - char *file = "/dev/zero"; - pthread_t *tid; - char *p; - - while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { - switch (opt) { - case 'a': - cmd = PIN_FAST_BENCHMARK; - break; - case 'b': - cmd = PIN_BASIC_TEST; - break; - case 'L': - cmd = PIN_LONGTERM_BENCHMARK; - break; - case 'c': - cmd = DUMP_USER_PAGES_TEST; - /* - * Dump page 0 (index 1). May be overridden later, by - * user's non-option arguments. - * - * .which_pages is zero-based, so that zero can mean "do - * nothing". - */ - gup.which_pages[0] = 1; - break; - case 'p': - /* works only with DUMP_USER_PAGES_TEST */ - gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; - break; - case 'F': - /* strtol, so you can pass flags in hex form */ - gup.gup_flags = strtol(optarg, 0, 0); - break; - case 'j': - nthreads = atoi(optarg); - break; - case 'm': - size = atoi(optarg) * MB; - break; - case 'r': - repeats = atoi(optarg); - break; - case 'n': - nr_pages = atoi(optarg); - break; - case 't': - thp = 1; - break; - case 'T': - thp = 0; - break; - case 'U': - cmd = GUP_BASIC_TEST; - break; - case 'u': - cmd = GUP_FAST_BENCHMARK; - break; - case 'w': - write = 1; - break; - case 'W': - write = 0; - break; - case 'f': - file = optarg; - break; - case 'S': - flags &= ~MAP_PRIVATE; - flags |= MAP_SHARED; - break; - case 'H': - flags |= (MAP_HUGETLB | MAP_ANONYMOUS); - break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; - default: - return -1; - } - } - - if (optind < argc) { - int extra_arg_count = 0; - /* - * For example: - * - * ./gup_test -c 0 1 0x1001 - * - * ...to dump pages 0, 1, and 4097 - */ - - while ((optind < argc) && - (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { - /* - * Do the 1-based indexing here, so that the user can - * use normal 0-based indexing on the command line. - */ - long page_index = strtol(argv[optind], 0, 0) + 1; - - gup.which_pages[extra_arg_count] = page_index; - extra_arg_count++; - optind++; - } - } - - filed = open(file, O_RDWR|O_CREAT); - if (filed < 0) { - perror("open"); - exit(filed); - } - - gup.nr_pages_per_call = nr_pages; - if (write) - gup.gup_flags |= FOLL_WRITE; - - gup_fd = open(GUP_TEST_FILE, O_RDWR); - if (gup_fd == -1) { - switch (errno) { - case EACCES: - if (getuid()) - printf("Please run this test as root\n"); - break; - case ENOENT: - if (opendir("/sys/kernel/debug") == NULL) { - printf("mount debugfs at /sys/kernel/debug\n"); - break; - } - printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); - break; - default: - perror("failed to open " GUP_TEST_FILE); - break; - } - exit(KSFT_SKIP); - } - - p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); - if (p == MAP_FAILED) { - perror("mmap"); - exit(1); - } - gup.addr = (unsigned long)p; - - if (thp == 1) - madvise(p, size, MADV_HUGEPAGE); - else if (thp == 0) - madvise(p, size, MADV_NOHUGEPAGE); - - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; - } - - tid = malloc(sizeof(pthread_t) * nthreads); - assert(tid); - for (i = 0; i < nthreads; i++) { - ret = pthread_create(&tid[i], NULL, gup_thread, &gup); - assert(ret == 0); - } - for (i = 0; i < nthreads; i++) { - ret = pthread_join(tid[i], NULL); - assert(ret == 0); - } - free(tid); - - return 0; -} diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c deleted file mode 100644 index 4adaad1b822f..000000000000 --- a/tools/testing/selftests/vm/hmm-tests.c +++ /dev/null @@ -1,2054 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * HMM stands for Heterogeneous Memory Management, it is a helper layer inside - * the linux kernel to help device drivers mirror a process address space in - * the device. This allows the device to use the same address space which - * makes communication and data exchange a lot easier. - * - * This framework's sole purpose is to exercise various code paths inside - * the kernel to make sure that HMM performs as expected and to flush out any - * bugs. - */ - -#include "../kselftest_harness.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * This is a private UAPI to the kernel test module so it isn't exported - * in the usual include/uapi/... directory. - */ -#include -#include - -struct hmm_buffer { - void *ptr; - void *mirror; - unsigned long size; - int fd; - uint64_t cpages; - uint64_t faults; -}; - -enum { - HMM_PRIVATE_DEVICE_ONE, - HMM_PRIVATE_DEVICE_TWO, - HMM_COHERENCE_DEVICE_ONE, - HMM_COHERENCE_DEVICE_TWO, -}; - -#define TWOMEG (1 << 21) -#define HMM_BUFFER_SIZE (1024 << 12) -#define HMM_PATH_MAX 64 -#define NTIMES 10 - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ - -FIXTURE(hmm) -{ - int fd; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm) -{ - int device_number; -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_private) -{ - .device_number = HMM_PRIVATE_DEVICE_ONE, -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) -{ - .device_number = HMM_COHERENCE_DEVICE_ONE, -}; - -FIXTURE(hmm2) -{ - int fd0; - int fd1; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm2) -{ - int device_number0; - int device_number1; -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) -{ - .device_number0 = HMM_PRIVATE_DEVICE_ONE, - .device_number1 = HMM_PRIVATE_DEVICE_TWO, -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) -{ - .device_number0 = HMM_COHERENCE_DEVICE_ONE, - .device_number1 = HMM_COHERENCE_DEVICE_TWO, -}; - -static int hmm_open(int unit) -{ - char pathname[HMM_PATH_MAX]; - int fd; - - snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); - fd = open(pathname, O_RDWR, 0); - if (fd < 0) - fprintf(stderr, "could not open hmm dmirror driver (%s)\n", - pathname); - return fd; -} - -static bool hmm_is_coherent_type(int dev_num) -{ - return (dev_num >= HMM_COHERENCE_DEVICE_ONE); -} - -FIXTURE_SETUP(hmm) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd = hmm_open(variant->device_number); - if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd, 0); -} - -FIXTURE_SETUP(hmm2) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd0 = hmm_open(variant->device_number0); - if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(variant->device_number1); - ASSERT_GE(self->fd1, 0); -} - -FIXTURE_TEARDOWN(hmm) -{ - int ret = close(self->fd); - - ASSERT_EQ(ret, 0); - self->fd = -1; -} - -FIXTURE_TEARDOWN(hmm2) -{ - int ret = close(self->fd0); - - ASSERT_EQ(ret, 0); - self->fd0 = -1; - - ret = close(self->fd1); - ASSERT_EQ(ret, 0); - self->fd1 = -1; -} - -static int hmm_dmirror_cmd(int fd, - unsigned long request, - struct hmm_buffer *buffer, - unsigned long npages) -{ - struct hmm_dmirror_cmd cmd; - int ret; - - /* Simulate a device reading system memory. */ - cmd.addr = (__u64)buffer->ptr; - cmd.ptr = (__u64)buffer->mirror; - cmd.npages = npages; - - for (;;) { - ret = ioctl(fd, request, &cmd); - if (ret == 0) - break; - if (errno == EINTR) - continue; - return -errno; - } - buffer->cpages = cmd.cpages; - buffer->faults = cmd.faults; - - return 0; -} - -static void hmm_buffer_free(struct hmm_buffer *buffer) -{ - if (buffer == NULL) - return; - - if (buffer->ptr) - munmap(buffer->ptr, buffer->size); - free(buffer->mirror); - free(buffer); -} - -/* - * Create a temporary file that will be deleted on close. - */ -static int hmm_create_file(unsigned long size) -{ - char path[HMM_PATH_MAX]; - int fd; - - strcpy(path, "/tmp"); - fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); - if (fd >= 0) { - int r; - - do { - r = ftruncate(fd, size); - } while (r == -1 && errno == EINTR); - if (!r) - return fd; - close(fd); - } - return -1; -} - -/* - * Return a random unsigned number. - */ -static unsigned int hmm_random(void) -{ - static int fd = -1; - unsigned int r; - - if (fd < 0) { - fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) { - fprintf(stderr, "%s:%d failed to open /dev/urandom\n", - __FILE__, __LINE__); - return ~0U; - } - } - read(fd, &r, sizeof(r)); - return r; -} - -static void hmm_nanosleep(unsigned int n) -{ - struct timespec t; - - t.tv_sec = 0; - t.tv_nsec = n; - nanosleep(&t, NULL); -} - -static int hmm_migrate_sys_to_dev(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); -} - -static int hmm_migrate_dev_to_sys(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); -} - -/* - * Simple NULL test of device open/close. - */ -TEST_F(hmm, open_close) -{ -} - -/* - * Read private anonymous memory. - */ -TEST_F(hmm, anon_read) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int val; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* - * Initialize buffer in system memory but leave the first two pages - * zero (pte_none and pfn_zero). - */ - i = 2 * self->page_size / sizeof(*ptr); - for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Set buffer permission to read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Populate the CPU page table with a special zero page. */ - val = *(int *)(buffer->ptr + self->page_size); - ASSERT_EQ(val, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - ptr = buffer->mirror; - for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - for (; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Read private anonymous memory which has been protected with - * mprotect() PROT_NONE. - */ -TEST_F(hmm, anon_read_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize mirror buffer so we can verify it isn't written. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - /* Protect buffer from reading. */ - ret = mprotect(buffer->ptr, size, PROT_NONE); - ASSERT_EQ(ret, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, -EFAULT); - - /* Allow CPU to read the buffer so we can check it. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory. - */ -TEST_F(hmm, anon_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory which has been protected with - * mprotect() PROT_READ. - */ -TEST_F(hmm, anon_write_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device reading a zero page of memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - ASSERT_EQ(buffer->faults, 1); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - - /* Now allow writing and see that the zero page is replaced. */ - ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Check that a device writing an anonymous private mapping - * will copy-on-write if a child process inherits the mapping. - */ -TEST_F(hmm, anon_write_child) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Check that a device writing an anonymous shared mapping - * will not copy-on-write if a child process inherits the mapping. - */ -TEST_F(hmm, anon_write_child_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Write private anonymous huge page. - */ -TEST_F(hmm, anon_write_huge) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - void *old_ptr; - void *map; - int *ptr; - int ret; - - size = 2 * TWOMEG; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - size = TWOMEG; - npages = size >> self->page_shift; - map = (void *)ALIGN((uintptr_t)buffer->ptr, size); - ret = madvise(map, size, MADV_HUGEPAGE); - ASSERT_EQ(ret, 0); - old_ptr = buffer->ptr; - buffer->ptr = map; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - buffer->ptr = old_ptr; - hmm_buffer_free(buffer); -} - -/* - * Read numeric data from raw and tagged kernel status files. Used to read - * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). - */ -static long file_read_ulong(char *file, const char *tag) -{ - int fd; - char buf[2048]; - int len; - char *p, *q; - long val; - - fd = open(file, O_RDONLY); - if (fd < 0) { - /* Error opening the file */ - return -1; - } - - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) { - /* Error in reading the file */ - return -1; - } - if (len == sizeof(buf)) { - /* Error file is too large */ - return -1; - } - buf[len] = '\0'; - - /* Search for a tag if provided */ - if (tag) { - p = strstr(buf, tag); - if (!p) - return -1; /* looks like the line we want isn't there */ - p += strlen(tag); - } else - p = buf; - - val = strtol(p, &q, 0); - if (*q != ' ') { - /* Error parsing the file */ - return -1; - } - - return val; -} - -/* - * Write huge TLBFS page. - */ -TEST_F(hmm, anon_write_hugetlbfs) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long default_hsize; - unsigned long i; - int *ptr; - int ret; - - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) - SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ - - size = ALIGN(TWOMEG, default_hsize); - npages = size >> self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (buffer->ptr == MAP_FAILED) { - free(buffer); - SKIP(return, "Huge page could not be allocated"); - } - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - munmap(buffer->ptr, buffer->size); - buffer->ptr = NULL; - hmm_buffer_free(buffer); -} - -/* - * Read mmap'ed file memory. - */ -TEST_F(hmm, file_read) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int fd; - ssize_t len; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - fd = hmm_create_file(size); - ASSERT_GE(fd, 0); - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = fd; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Write initial contents of the file. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - len = pwrite(fd, buffer->mirror, size, 0); - ASSERT_EQ(len, size); - memset(buffer->mirror, 0, size); - - buffer->ptr = mmap(NULL, size, - PROT_READ, - MAP_SHARED, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write mmap'ed file memory. - */ -TEST_F(hmm, file_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int fd; - ssize_t len; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - fd = hmm_create_file(size); - ASSERT_GE(fd, 0); - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = fd; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check that the device also wrote the file. */ - len = pread(fd, buffer->mirror, size, 0); - ASSERT_EQ(len, size); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory. - */ -TEST_F(hmm, migrate) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory and fault some of it back - * to system memory, then try migrating the resulting mix of system and device - * private memory to the device. - */ -TEST_F(hmm, migrate_fault) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault half the pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate memory to the device again. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, migrate_release) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Release device memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); - ASSERT_EQ(ret, 0); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous shared memory to device private memory. - */ -TEST_F(hmm, migrate_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, -ENOENT); - - hmm_buffer_free(buffer); -} - -/* - * Try to migrate various memory types to device private memory. - */ -TEST_F(hmm2, migrate_mixed) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - int ret; - int val; - - npages = 6; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Migrating a protected area should be an error. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, -EINVAL); - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); - ASSERT_EQ(ret, -EINVAL); - - /* Page 2 will be a read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. */ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-5 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - ptr = (int *)(buffer->ptr + 5 * self->page_size); - *ptr = val; - - /* Now try to migrate pages 2-5 to device 1. */ - buffer->ptr = p + 2 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 4); - - /* Page 5 won't be migrated to device 0 because it's on device 1. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, -ENOENT); - buffer->ptr = p; - - buffer->ptr = p; - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device memory and back to system memory - * multiple times. In case of private zone configuration, this is done - * through fault pages accessed by CPU. In case of coherent zone configuration, - * the pages from the device should be explicitly migrated back to system memory. - * The reason is Coherent device zone has coherent access by CPU, therefore - * it will not generate any page fault. - */ -TEST_F(hmm, migrate_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate back to system memory and check them. */ - if (hmm_is_coherent_type(variant->device_number)) { - ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - } - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); - } -} - -/* - * Read anonymous memory multiple times. - */ -TEST_F(hmm, anon_read_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i + c); - - hmm_buffer_free(buffer); - } -} - -void *unmap_buffer(void *p) -{ - struct hmm_buffer *buffer = p; - - /* Delay for a bit and then unmap buffer while it is being read. */ - hmm_nanosleep(hmm_random() % 32000); - munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); - buffer->ptr = NULL; - - return NULL; -} - -/* - * Try reading anonymous memory while it is being unmapped. - */ -TEST_F(hmm, anon_teardown) -{ - unsigned long npages; - unsigned long size; - unsigned long c; - void *ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; ++c) { - pthread_t thread; - struct hmm_buffer *buffer; - unsigned long i; - int *ptr; - int rc; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - rc = pthread_create(&thread, NULL, unmap_buffer, buffer); - ASSERT_EQ(rc, 0); - - /* Simulate a device reading system memory. */ - rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - if (rc == 0) { - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; - i < size / sizeof(*ptr); - ++i) - ASSERT_EQ(ptr[i], i + c); - } - - pthread_join(thread, &ret); - hmm_buffer_free(buffer); - } -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. - */ -TEST_F(hmm, mixedmap) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned char *m; - int ret; - - npages = 1; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, - self->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); - - hmm_buffer_free(buffer); -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. - */ -TEST_F(hmm2, snapshot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - unsigned char *m; - int ret; - int val; - - npages = 7; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* Page 2 will be read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. */ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-6 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - - /* Page 5 will be migrated to device 0. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Page 6 will be migrated to device 1. */ - buffer->ptr = p + 6 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Simulate a device snapshotting CPU pagetables. */ - buffer->ptr = p; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); - ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); - ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); - ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); - ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - if (!hmm_is_coherent_type(variant->device_number0)) { - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); - } else { - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | - HMM_DMIRROR_PROT_WRITE); - } - - hmm_buffer_free(buffer); -} - -/* - * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that - * should be mapped by a large page table entry. - */ -TEST_F(hmm, compound) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long default_hsize; - int *ptr; - unsigned char *m; - int ret; - unsigned long i; - - /* Skip test if we can't allocate a hugetlbfs page. */ - - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) - SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ - - size = ALIGN(TWOMEG, default_hsize); - npages = size >> self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (buffer->ptr == MAP_FAILED) { - free(buffer); - return; - } - - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Initialize the pages the device will snapshot in buffer->ptr. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | - HMM_DMIRROR_PROT_PMD); - - /* Make the region read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | - HMM_DMIRROR_PROT_PMD); - - munmap(buffer->ptr, buffer->size); - buffer->ptr = NULL; - hmm_buffer_free(buffer); -} - -/* - * Test two devices reading the same memory (double mapped). - */ -TEST_F(hmm2, double_map) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = 6; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Make region read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate device 0 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Simulate device 1 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate pages to device 1 and try to read from device 0. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what device 0 read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Basic check of exclusive faulting. - */ -TEST_F(hmm, exclusive) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - /* Check atomic access revoked */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, exclusive_mprotect) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - hmm_buffer_free(buffer); -} - -/* - * Check copy-on-write works. - */ -TEST_F(hmm, exclusive_cow) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - fork(); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - hmm_buffer_free(buffer); -} - -static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, - int npages, int size, int flags) -{ - struct gup_test gup = { - .nr_pages_per_call = npages, - .addr = addr, - .gup_flags = FOLL_WRITE | flags, - .size = size, - }; - - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl on error\n"); - return errno; - } - - return 0; -} - -/* - * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. - * This should trigger a migration back to system memory for both, private - * and coherent type pages. - * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added - * to your configuration before you run it. - */ -TEST_F(hmm, hmm_gup_test) -{ - struct hmm_buffer *buffer; - int gup_fd; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - unsigned char *m; - - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); - if (gup_fd == -1) - SKIP(return, "Skipping test, could not find gup_test driver"); - - npages = 4; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr, - GUP_BASIC_TEST, 1, self->page_size, 0), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 1 * self->page_size, - GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 2 * self->page_size, - PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 3 * self->page_size, - PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); - - /* Take snapshot to CPU pagetables */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - m = buffer->mirror; - if (hmm_is_coherent_type(variant->device_number)) { - ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); - ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); - } else { - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); - } - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); - /* - * Check again the content on the pages. Make sure there's no - * corrupted data. - */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - close(gup_fd); - hmm_buffer_free(buffer); -} - -/* - * Test copy-on-write in device pages. - * In case of writing to COW private page(s), a page fault will migrate pages - * back to system memory first. Then, these pages will be duplicated. In case - * of COW device coherent type, pages are duplicated directly from device - * memory. - */ -TEST_F(hmm, hmm_cow_in_device) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - unsigned char *m; - pid_t pid; - int status; - - npages = 4; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (!pid) { - /* Child process waitd for SIGTERM from the parent. */ - while (1) { - } - perror("Should not reach this\n"); - exit(0); - } - /* Parent process writes to COW pages(s) and gets a - * new copy in system. In case of device private pages, - * this write causes a migration to system mem first. - */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Terminate child and wait */ - EXPECT_EQ(0, kill(pid, SIGTERM)); - EXPECT_EQ(pid, waitpid(pid, &status, 0)); - EXPECT_NE(0, WIFSIGNALED(status)); - EXPECT_EQ(SIGTERM, WTERMSIG(status)); - - /* Take snapshot to CPU pagetables */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - m = buffer->mirror; - for (i = 0; i < npages; i++) - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); - - hmm_buffer_free(buffer); -} -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c deleted file mode 100644 index 955ef87f382c..000000000000 --- a/tools/testing/selftests/vm/hugepage-mmap.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mmap: - * - * Example of using huge page memory in a user application using the mmap - * system call. Before running this application, make sure that the - * administrator has mounted the hugetlbfs filesystem (on some directory - * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this - * example, the app is requesting memory of size 256MB that is backed by - * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr) -{ - unsigned long i; - - for (i = 0; i < LENGTH; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < LENGTH; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(void) -{ - void *addr; - int fd, ret; - - fd = memfd_create("hugepage-mmap", MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - close(fd); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); - - munmap(addr, LENGTH); - close(fd); - - return ret; -} diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c deleted file mode 100644 index e53b5eaa8fce..000000000000 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mremap: - * - * Example of remapping huge page memory in a user application using the - * mremap system call. The path to a file in a hugetlbfs filesystem must - * be passed as the last argument to this test. The amount of memory used - * by this test in MBs can optionally be passed as an argument. If no memory - * amount is passed, the default amount is 10MB. - * - * To make sure the test triggers pmd sharing and goes through the 'unshare' - * path in the mremap code use 1GB (1024) or more. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include /* Definition of O_* constants */ -#include /* Definition of SYS_* constants */ -#include -#include -#include - -#define DEFAULT_LENGTH_MB 10UL -#define MB_TO_BYTES(x) (x * 1024 * 1024) - -#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) -#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t len) -{ - unsigned long i; - - for (i = 0; i < len; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t len) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < len; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -static void register_region_with_uffd(char *addr, size_t len) -{ - long uffd; /* userfaultfd file descriptor */ - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - - /* Create and enable userfaultfd object. */ - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - perror("userfaultfd"); - exit(1); - } - - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - perror("ioctl-UFFDIO_API"); - exit(1); - } - - /* Create a private anonymous mapping. The memory will be - * demand-zero paged--that is, not yet allocated. When we - * actually touch the memory, it will be allocated via - * the userfaultfd. - */ - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Address returned by mmap() = %p\n", addr); - - /* Register the memory range of the mapping we just created for - * handling by the userfaultfd object. In mode, we request to track - * missing pages (i.e., pages that have not yet been faulted in). - */ - - uffdio_register.range.start = (unsigned long)addr; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - perror("ioctl-UFFDIO_REGISTER"); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - size_t length = 0; - int ret = 0, fd; - - if (argc >= 2 && !strcmp(argv[1], "-h")) { - printf("Usage: %s [length_in_MB]\n", argv[0]); - exit(1); - } - - /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. - */ - if (argc >= 2) - length = (size_t)atoi(argv[1]); - else - length = DEFAULT_LENGTH_MB; - - length = MB_TO_BYTES(length); - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("Open failed"); - exit(1); - } - - /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ - unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map haddr: Returned address is %p\n", haddr); - if (haddr == MAP_FAILED) { - perror("mmap1"); - exit(1); - } - - /* mmap again to a dummy address to hopefully trigger pmd sharing. */ - suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map daddr: Returned address is %p\n", daddr); - if (daddr == MAP_FAILED) { - perror("mmap3"); - exit(1); - } - - suggested_addr = 0x7faa40000000; - void *vaddr = - mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); - printf("Map vaddr: Returned address is %p\n", vaddr); - if (vaddr == MAP_FAILED) { - perror("mmap2"); - exit(1); - } - - register_region_with_uffd(haddr, length); - - void *addr = mremap(haddr, length, length, - MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); - if (addr == MAP_FAILED) { - perror("mremap"); - exit(1); - } - - printf("Mremap: Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - munmap(addr, length); - - addr = mremap(addr, length, length, 0); - if (addr != MAP_FAILED) { - printf("mremap: Expected failure, but call succeeded\n"); - exit(1); - } - - close(fd); - - return ret; -} diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c deleted file mode 100644 index e2527f32005b..000000000000 --- a/tools/testing/selftests/vm/hugepage-shm.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-shm: - * - * Example of using huge page memory in a user application using Sys V shared - * memory system calls. In this example the app is requesting 256MB of - * memory that is backed by huge pages. The application uses the flag - * SHM_HUGETLB in the shmget system call to inform the kernel that it is - * requesting huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * - * Note: The default shared memory limit is quite low on many kernels, - * you may need to increase it via: - * - * echo 268435456 > /proc/sys/kernel/shmmax - * - * This will increase the maximum size per shared memory segment to 256MB. - * The other limit that you will hit eventually is shmall which is the - * total amount of shared memory in pages. To set it to 16GB on a system - * with a 4kB pagesize do: - * - * echo 4194304 > /proc/sys/kernel/shmall - */ - -#include -#include -#include -#include -#include -#include - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - -#define LENGTH (256UL*1024*1024) - -#define dprintf(x) printf(x) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - -int main(void) -{ - int shmid; - unsigned long i; - char *shmaddr; - - shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } - printf("shmid: 0x%x\n", shmid); - - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); - if (shmaddr == (char *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", shmaddr); - - dprintf("Starting the writes:\n"); - for (i = 0; i < LENGTH; i++) { - shmaddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - dprintf("."); - } - dprintf("\n"); - - dprintf("Starting the Check..."); - for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) { - printf("\nIndex %lu mismatched\n", i); - exit(3); - } - dprintf("Done.\n"); - - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - - return 0; -} diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c deleted file mode 100644 index 557bdbd4f87e..000000000000 --- a/tools/testing/selftests/vm/hugepage-vmemmap.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * A test case of using hugepage memory in a user application using the - * mmap system call with MAP_HUGETLB flag. Before running this program - * make sure the administrator has allocated enough default sized huge - * pages to cover the 2 MB allocation. - */ -#include -#include -#include -#include -#include - -#define MAP_LENGTH (2UL * 1024 * 1024) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#define PAGE_SIZE 4096 - -#define PAGE_COMPOUND_HEAD (1UL << 15) -#define PAGE_COMPOUND_TAIL (1UL << 16) -#define PAGE_HUGE (1UL << 17) - -#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) -#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) - -#define PM_PFRAME_BITS 55 -#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) - -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. - */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static unsigned long virt_to_pfn(void *addr) -{ - int fd; - unsigned long pagemap; - - fd = open("/proc/self/pagemap", O_RDONLY); - if (fd < 0) - return -1UL; - - lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); - read(fd, &pagemap, sizeof(pagemap)); - close(fd); - - return pagemap & ~PM_PFRAME_MASK; -} - -static int check_page_flags(unsigned long pfn) -{ - int fd, i; - unsigned long pageflags; - - fd = open("/proc/kpageflags", O_RDONLY); - if (fd < 0) - return -1; - - lseek(fd, pfn * sizeof(pageflags), SEEK_SET); - - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { - close(fd); - printf("Head page flags (%lx) is invalid\n", pageflags); - return -1; - } - - /* - * pages other than the first page must be tail and shouldn't be head; - * this also verifies kernel has correctly set the fake page_head to tail - * while hugetlb_free_vmemmap is enabled. - */ - for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || - (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { - close(fd); - printf("Tail page flags (%lx) is invalid\n", pageflags); - return -1; - } - } - - close(fd); - - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - unsigned long pfn; - - addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* Trigger allocation of HugeTLB page. */ - write_bytes(addr, MAP_LENGTH); - - pfn = virt_to_pfn(addr); - if (pfn == -1UL) { - munmap(addr, MAP_LENGTH); - perror("virt_to_pfn"); - exit(1); - } - - printf("Returned address is %p whose pfn is %lx\n", addr, pfn); - - if (check_page_flags(pfn) < 0) { - munmap(addr, MAP_LENGTH); - perror("check_page_flags"); - exit(1); - } - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, MAP_LENGTH)) { - perror("munmap"); - exit(1); - } - - return 0; -} diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c deleted file mode 100644 index a634f47d1e56..000000000000 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ /dev/null @@ -1,406 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-madvise: - * - * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE - * on hugetlb mappings. - * - * Before running this test, make sure the administrator has pre-allocated - * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, - * the test takes an argument that is the path to a file in a hugetlbfs - * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some - * directory. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#define __USE_GNU -#include - -#define MIN_FREE_PAGES 20 -#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ - -#define validate_free_pages(exp_free) \ - do { \ - int fhp = get_free_hugepages(); \ - if (fhp != (exp_free)) { \ - printf("Unexpected number of free huge " \ - "pages line %d\n", __LINE__); \ - exit(1); \ - } \ - } while (0) - -unsigned long huge_page_size; -unsigned long base_page_size; - -/* - * default_huge_page_size copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -unsigned long get_free_hugepages(void) -{ - unsigned long fhp = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return fhp; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) - break; - } - - free(line); - fclose(f); - return fhp; -} - -void write_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long i; - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(addr + (i * huge_page_size))) = i; -} - -void read_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long dummy = 0; - unsigned long i; - - for (i = 0; i < nr_pages; i++) - dummy += *((unsigned long *)(addr + (i * huge_page_size))); -} - -int main(int argc, char **argv) -{ - unsigned long free_hugepages; - void *addr, *addr2; - int fd; - int ret; - - huge_page_size = default_huge_page_size(); - if (!huge_page_size) { - printf("Unable to determine huge page size, exiting!\n"); - exit(1); - } - base_page_size = sysconf(_SC_PAGE_SIZE); - if (!huge_page_size) { - printf("Unable to determine base page size, exiting!\n"); - exit(1); - } - - free_hugepages = get_free_hugepages(); - if (free_hugepages < MIN_FREE_PAGES) { - printf("Not enough free huge pages to test, exiting!\n"); - exit(1); - } - - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - /* - * Test validity of MADV_DONTNEED addr and length arguments. mmap - * size is NR_HUGE_PAGES + 2. One page at the beginning and end of - * the mapping will be unmapped so we KNOW there is nothing mapped - * there. - */ - addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - if (munmap(addr, huge_page_size) || - munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, - huge_page_size)) { - perror("munmap"); - exit(1); - } - addr = addr + huge_page_size; - - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr before mapping should fail */ - ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid addr line %d\n", - __LINE__); - exit(1); - } - - /* addr + length after mapping should fail */ - ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid length line %d\n", - __LINE__); - exit(1); - } - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test alignment of MADV_DONTNEED addr and length arguments - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr is not huge page size aligned and should fail */ - ret = madvise(addr + base_page_size, - NR_HUGE_PAGES * huge_page_size - base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with unaligned start address %d\n", - __LINE__); - exit(1); - } - - /* addr + length should be aligned down to huge page size */ - if (madvise(addr, - ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, - MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all but last page in mapping */ - validate_free_pages(free_hugepages - 1); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - validate_free_pages(free_hugepages); - - /* - * Test MADV_DONTNEED on anonymous private mapping - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all pages in mapping */ - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* read should not consume any pages */ - read_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise should free private pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * The fallocate below certainly should free the pages associated - * with the file. However, pages in the private mapping are also - * freed. This is not the 'correct' behavior, but is expected - * because this is how it has worked since the initial hugetlb - * implementation. - */ - if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on shared mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* write should not consume any pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* - * Test MADV_REMOVE on shared mapping of hugetlb file - * - * madvise is same as hole punch and should free all pages. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_REMOVE on shared and private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* shared write should not consume any additional pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr2 == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* private read should not consume any pages */ - read_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages */ - write_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of shared mapping should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of private mapping should free private pages */ - if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages again */ - write_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * madvise should free both file and private pages although this is - * not correct. private pages should not be freed, but this is - * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); - - close(fd); - return 0; -} diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh deleted file mode 100644 index bf2d2a684edf..000000000000 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." - exit $ksft_skip -fi - -usage_file=usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - usage_file=current -fi - - -if [[ $cgroup2 ]]; then - CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup2 none $CGROUP_ROOT - do_umount=1 - fi - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control -else - CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $CGROUP_ROOT - do_umount=1 - fi -fi -MNT='/mnt/huge/' - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function cleanup() { - echo cleanup - set +e - rm -rf "$MNT"/* 2>/dev/null - umount "$MNT" 2>/dev/null - rmdir "$MNT" 2>/dev/null - rmdir "$CGROUP_ROOT"/a/b 2>/dev/null - rmdir "$CGROUP_ROOT"/a 2>/dev/null - rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo 0 >/proc/sys/vm/nr_hugepages - set -e -} - -function assert_state() { - local expected_a="$1" - local expected_a_hugetlb="$2" - local expected_b="" - local expected_b_hugetlb="" - - if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then - expected_b="$3" - expected_b_hugetlb="$4" - fi - local tolerance=$((5 * 1024 * 1024)) - - local actual_a - actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" - if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || - [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then - echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB - echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_a_hugetlb - actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || - [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then - echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB - echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then - return - fi - - local actual_b - actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" - if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || - [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then - echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB - echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_b_hugetlb - actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || - [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then - echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB - echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi -} - -function setup() { - echo 100 >/proc/sys/vm/nr_hugepages - mkdir "$CGROUP_ROOT"/a - sleep 1 - if [[ $cgroup2 ]]; then - echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control - else - echo 0 >$CGROUP_ROOT/a/cpuset.mems - echo 0 >$CGROUP_ROOT/a/cpuset.cpus - fi - - mkdir "$CGROUP_ROOT"/a/b - - if [[ ! $cgroup2 ]]; then - echo 0 >$CGROUP_ROOT/a/b/cpuset.mems - echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus - fi - - mkdir -p "$MNT" - mount -t hugetlbfs none "$MNT" -} - -write_hugetlbfs() { - local cgroup="$1" - local path="$2" - local size="$3" - - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs - else - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus - echo $$ >"$CGROUP_ROOT/$cgroup/tasks" - fi - ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/cgroup.procs - else - echo $$ >"$CGROUP_ROOT/tasks" - fi - echo -} - -set -e - -size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. - -cleanup - -echo -echo -echo Test charge, rmdir, uncharge -setup -echo mkdir -mkdir $CGROUP_ROOT/test1 - -echo write -write_hugetlbfs test1 "$MNT"/test $size - -echo rmdir -rmdir $CGROUP_ROOT/test1 -mkdir $CGROUP_ROOT/test1 - -echo uncharge -rm -rf /mnt/huge/* - -cleanup - -echo done -echo -echo -if [[ ! $cgroup2 ]]; then - echo "Test parent and child hugetlb usage" - setup - - echo write - write_hugetlbfs a "$MNT"/test $size - - echo Assert memory charged correctly for parent use. - assert_state 0 $size 0 0 - - write_hugetlbfs a/b "$MNT"/test2 $size - - echo Assert memory charged correctly for child use. - assert_state 0 $(($size * 2)) 0 $size - - rmdir "$CGROUP_ROOT"/a/b - sleep 5 - echo Assert memory reparent correctly. - assert_state 0 $(($size * 2)) - - rm -rf "$MNT"/* - umount "$MNT" - echo Assert memory uncharged correctly. - assert_state 0 0 - - cleanup -fi - -echo -echo -echo "Test child only hugetlb usage" -echo setup -setup - -echo write -write_hugetlbfs a/b "$MNT"/test2 $size - -echo Assert memory charged correctly for child only use. -assert_state 0 $(($size)) 0 $size - -rmdir "$CGROUP_ROOT"/a/b -echo Assert memory reparent correctly. -assert_state 0 $size - -rm -rf "$MNT"/* -umount "$MNT" -echo Assert memory uncharged correctly. -assert_state 0 0 - -cleanup - -echo ALL PASS - -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c deleted file mode 100644 index 64126c8cd561..000000000000 --- a/tools/testing/selftests/vm/khugepaged.c +++ /dev/null @@ -1,1558 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "linux/magic.h" - -#include "vm_util.h" - -#ifndef MADV_PAGEOUT -#define MADV_PAGEOUT 21 -#endif -#ifndef MADV_POPULATE_READ -#define MADV_POPULATE_READ 22 -#endif -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -#define BASE_ADDR ((void *)(1UL << 30)) -static unsigned long hpage_pmd_size; -static unsigned long page_size; -static int hpage_pmd_nr; - -#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" -#define PID_SMAPS "/proc/self/smaps" -#define TEST_FILE "collapse_test_file" - -#define MAX_LINE_LENGTH 500 - -enum vma_type { - VMA_ANON, - VMA_FILE, - VMA_SHMEM, -}; - -struct mem_ops { - void *(*setup_area)(int nr_hpages); - void (*cleanup_area)(void *p, unsigned long size); - void (*fault)(void *p, unsigned long start, unsigned long end); - bool (*check_huge)(void *addr, int nr_hpages); - const char *name; -}; - -static struct mem_ops *file_ops; -static struct mem_ops *anon_ops; -static struct mem_ops *shmem_ops; - -struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect); - bool enforce_pte_scan_limits; - const char *name; -}; - -static struct collapse_context *khugepaged_context; -static struct collapse_context *madvise_context; - -struct file_info { - const char *dir; - char path[PATH_MAX]; - enum vma_type type; - int fd; - char dev_queue_read_ahead_path[PATH_MAX]; -}; - -static struct file_info finfo; - -enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, - THP_NEVER, -}; - -static const char *thp_enabled_strings[] = { - "always", - "madvise", - "never", - NULL -}; - -enum thp_defrag { - THP_DEFRAG_ALWAYS, - THP_DEFRAG_DEFER, - THP_DEFRAG_DEFER_MADVISE, - THP_DEFRAG_MADVISE, - THP_DEFRAG_NEVER, -}; - -static const char *thp_defrag_strings[] = { - "always", - "defer", - "defer+madvise", - "madvise", - "never", - NULL -}; - -enum shmem_enabled { - SHMEM_ALWAYS, - SHMEM_WITHIN_SIZE, - SHMEM_ADVISE, - SHMEM_NEVER, - SHMEM_DENY, - SHMEM_FORCE, -}; - -static const char *shmem_enabled_strings[] = { - "always", - "within_size", - "advise", - "never", - "deny", - "force", - NULL -}; - -struct khugepaged_settings { - bool defrag; - unsigned int alloc_sleep_millisecs; - unsigned int scan_sleep_millisecs; - unsigned int max_ptes_none; - unsigned int max_ptes_swap; - unsigned int max_ptes_shared; - unsigned long pages_to_scan; -}; - -struct settings { - enum thp_enabled thp_enabled; - enum thp_defrag thp_defrag; - enum shmem_enabled shmem_enabled; - bool use_zero_page; - struct khugepaged_settings khugepaged; - unsigned long read_ahead_kb; -}; - -static struct settings saved_settings; -static bool skip_settings_restore; - -static int exit_status; - -static void success(const char *msg) -{ - printf(" \e[32m%s\e[0m\n", msg); -} - -static void fail(const char *msg) -{ - printf(" \e[31m%s\e[0m\n", msg); - exit_status++; -} - -static void skip(const char *msg) -{ - printf(" \e[33m%s\e[0m\n", msg); -} - -static int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) { - printf("open(%s)\n", path); - exit(EXIT_FAILURE); - return 0; - } - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) { - printf("write(%s)\n", buf); - exit(EXIT_FAILURE); - return 0; - } - - return (unsigned int) numwritten; -} - -static int read_string(const char *name, const char *strings[]) -{ - char path[PATH_MAX]; - char buf[256]; - char *c; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!read_file(path, buf, sizeof(buf))) { - perror(path); - exit(EXIT_FAILURE); - } - - c = strchr(buf, '['); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - - c++; - memmove(buf, c, sizeof(buf) - (c - buf)); - - c = strchr(buf, ']'); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - *c = '\0'; - - ret = 0; - while (strings[ret]) { - if (!strcmp(strings[ret], buf)) - return ret; - ret++; - } - - printf("Failed to parse %s\n", name); - exit(EXIT_FAILURE); -} - -static void write_string(const char *name, const char *val) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(path, val, strlen(val) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static const unsigned long _read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -static const unsigned long read_num(const char *name) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return _read_num(path); -} - -static void _write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - if (!write_file(path, buf, strlen(buf) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static void write_num(const char *name, unsigned long num) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - _write_num(path, num); -} - -static void write_settings(struct settings *settings) -{ - struct khugepaged_settings *khugepaged = &settings->khugepaged; - - write_string("enabled", thp_enabled_strings[settings->thp_enabled]); - write_string("defrag", thp_defrag_strings[settings->thp_defrag]); - write_string("shmem_enabled", - shmem_enabled_strings[settings->shmem_enabled]); - write_num("use_zero_page", settings->use_zero_page); - - write_num("khugepaged/defrag", khugepaged->defrag); - write_num("khugepaged/alloc_sleep_millisecs", - khugepaged->alloc_sleep_millisecs); - write_num("khugepaged/scan_sleep_millisecs", - khugepaged->scan_sleep_millisecs); - write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); - write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); - write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); - write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); - - if (file_ops && finfo.type == VMA_FILE) - _write_num(finfo.dev_queue_read_ahead_path, - settings->read_ahead_kb); -} - -#define MAX_SETTINGS_DEPTH 4 -static struct settings settings_stack[MAX_SETTINGS_DEPTH]; -static int settings_index; - -static struct settings *current_settings(void) -{ - if (!settings_index) { - printf("Fail: No settings set"); - exit(EXIT_FAILURE); - } - return settings_stack + settings_index - 1; -} - -static void push_settings(struct settings *settings) -{ - if (settings_index >= MAX_SETTINGS_DEPTH) { - printf("Fail: Settings stack exceeded"); - exit(EXIT_FAILURE); - } - settings_stack[settings_index++] = *settings; - write_settings(current_settings()); -} - -static void pop_settings(void) -{ - if (settings_index <= 0) { - printf("Fail: Settings stack empty"); - exit(EXIT_FAILURE); - } - --settings_index; - write_settings(current_settings()); -} - -static void restore_settings(int sig) -{ - if (skip_settings_restore) - goto out; - - printf("Restore THP and khugepaged settings..."); - write_settings(&saved_settings); - success("OK"); - if (sig) - exit(EXIT_FAILURE); -out: - exit(exit_status); -} - -static void save_settings(void) -{ - printf("Save THP and khugepaged settings..."); - saved_settings = (struct settings) { - .thp_enabled = read_string("enabled", thp_enabled_strings), - .thp_defrag = read_string("defrag", thp_defrag_strings), - .shmem_enabled = - read_string("shmem_enabled", shmem_enabled_strings), - .use_zero_page = read_num("use_zero_page"), - }; - saved_settings.khugepaged = (struct khugepaged_settings) { - .defrag = read_num("khugepaged/defrag"), - .alloc_sleep_millisecs = - read_num("khugepaged/alloc_sleep_millisecs"), - .scan_sleep_millisecs = - read_num("khugepaged/scan_sleep_millisecs"), - .max_ptes_none = read_num("khugepaged/max_ptes_none"), - .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), - .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), - .pages_to_scan = read_num("khugepaged/pages_to_scan"), - }; - if (file_ops && finfo.type == VMA_FILE) - saved_settings.read_ahead_kb = - _read_num(finfo.dev_queue_read_ahead_path); - - success("OK"); - - signal(SIGTERM, restore_settings); - signal(SIGINT, restore_settings); - signal(SIGHUP, restore_settings); - signal(SIGQUIT, restore_settings); -} - -static void get_finfo(const char *dir) -{ - struct stat path_stat; - struct statfs fs; - char buf[1 << 10]; - char path[PATH_MAX]; - char *str, *end; - - finfo.dir = dir; - stat(finfo.dir, &path_stat); - if (!S_ISDIR(path_stat.st_mode)) { - printf("%s: Not a directory (%s)\n", __func__, finfo.dir); - exit(EXIT_FAILURE); - } - if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (statfs(finfo.dir, &fs)) { - perror("statfs()"); - exit(EXIT_FAILURE); - } - finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; - if (finfo.type == VMA_SHMEM) - return; - - /* Find owning device's queue/read_ahead_kb control */ - if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", - major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - if (strstr(buf, "DEVTYPE=disk")) { - /* Found it */ - if (snprintf(finfo.dev_queue_read_ahead_path, - sizeof(finfo.dev_queue_read_ahead_path), - "/sys/dev/block/%d:%d/queue/read_ahead_kb", - major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return; - } - if (!strstr(buf, "DEVTYPE=partition")) { - printf("%s: Unknown device type: %s\n", __func__, path); - exit(EXIT_FAILURE); - } - /* - * Partition of block device - need to find actual device. - * Using naming convention that devnameN is partition of - * device devname. - */ - str = strstr(buf, "DEVNAME="); - if (!str) { - printf("%s: Could not read: %s", __func__, path); - exit(EXIT_FAILURE); - } - str += 8; - end = str; - while (*end) { - if (isdigit(*end)) { - *end = '\0'; - if (snprintf(finfo.dev_queue_read_ahead_path, - sizeof(finfo.dev_queue_read_ahead_path), - "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return; - } - ++end; - } - printf("%s: Could not read: %s\n", __func__, path); - exit(EXIT_FAILURE); -} - -static bool check_swap(void *addr, unsigned long size) -{ - bool swap = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", - size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - /* - * Fetch the Swap: in the same block and check whether it got - * the expected number of hugeepages next. - */ - if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - swap = true; -err_out: - fclose(fp); - return swap; -} - -static void *alloc_mapping(int nr) -{ - void *p; - - p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (p != BASE_ADDR) { - printf("Failed to allocate VMA at %p\n", BASE_ADDR); - exit(EXIT_FAILURE); - } - - return p; -} - -static void fill_memory(int *p, unsigned long start, unsigned long end) -{ - int i; - - for (i = start / page_size; i < end / page_size; i++) - p[i * page_size / sizeof(*p)] = i + 0xdead0000; -} - -/* - * MADV_COLLAPSE is a best-effort request and may fail if an internal - * resource is temporarily unavailable, in which case it will set errno to - * EAGAIN. In such a case, immediately reattempt the operation one more - * time. - */ -static int madvise_collapse_retry(void *p, unsigned long size) -{ - bool retry = true; - int ret; - -retry: - ret = madvise(p, size, MADV_COLLAPSE); - if (ret && errno == EAGAIN && retry) { - retry = false; - goto retry; - } - return ret; -} - -/* - * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with - * validate_memory()'able contents. - */ -static void *alloc_hpage(struct mem_ops *ops) -{ - void *p = ops->setup_area(1); - - ops->fault(p, 0, hpage_pmd_size); - - /* - * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. - * The latter is ineligible for collapse by MADV_COLLAPSE - * while the former might cause MADV_COLLAPSE to race with - * khugepaged on low-load system (like a test machine), which - * would cause MADV_COLLAPSE to fail with EAGAIN. - */ - printf("Allocate huge page..."); - if (madvise_collapse_retry(p, hpage_pmd_size)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (!ops->check_huge(p, 1)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { - perror("madvise(MADV_HUGEPAGE)"); - exit(EXIT_FAILURE); - } - success("OK"); - return p; -} - -static void validate_memory(int *p, unsigned long start, unsigned long end) -{ - int i; - - for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { - printf("Page %d is corrupted: %#x\n", - i, p[i * page_size / sizeof(*p)]); - exit(EXIT_FAILURE); - } - } -} - -static void *anon_setup_area(int nr_hpages) -{ - return alloc_mapping(nr_hpages); -} - -static void anon_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); -} - -static void anon_fault(void *p, unsigned long start, unsigned long end) -{ - fill_memory(p, start, end); -} - -static bool anon_check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - -static void *file_setup_area(int nr_hpages) -{ - int fd; - void *p; - unsigned long size; - - unlink(finfo.path); /* Cleanup from previous failed tests */ - printf("Creating %s for collapse%s...", finfo.path, - finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); - fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, - 777); - if (fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - - size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); - write(fd, p, size); - close(fd); - munmap(p, size); - success("OK"); - - printf("Opening %s read only for collapse...", finfo.path); - finfo.fd = open(finfo.path, O_RDONLY, 777); - if (finfo.fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, - MAP_PRIVATE, finfo.fd, 0); - if (p == MAP_FAILED || p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - - /* Drop page cache */ - write_file("/proc/sys/vm/drop_caches", "3", 2); - success("OK"); - return p; -} - -static void file_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); - unlink(finfo.path); -} - -static void file_fault(void *p, unsigned long start, unsigned long end) -{ - if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { - perror("madvise(MADV_POPULATE_READ"); - exit(EXIT_FAILURE); - } -} - -static bool file_check_huge(void *addr, int nr_hpages) -{ - switch (finfo.type) { - case VMA_FILE: - return check_huge_file(addr, nr_hpages, hpage_pmd_size); - case VMA_SHMEM: - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); - default: - exit(EXIT_FAILURE); - return false; - } -} - -static void *shmem_setup_area(int nr_hpages) -{ - void *p; - unsigned long size = nr_hpages * hpage_pmd_size; - - finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); - if (finfo.fd < 0) { - perror("memfd_create()"); - exit(EXIT_FAILURE); - } - if (ftruncate(finfo.fd, size)) { - perror("ftruncate()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, - 0); - if (p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - return p; -} - -static void shmem_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); -} - -static bool shmem_check_huge(void *addr, int nr_hpages) -{ - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); -} - -static struct mem_ops __anon_ops = { - .setup_area = &anon_setup_area, - .cleanup_area = &anon_cleanup_area, - .fault = &anon_fault, - .check_huge = &anon_check_huge, - .name = "anon", -}; - -static struct mem_ops __file_ops = { - .setup_area = &file_setup_area, - .cleanup_area = &file_cleanup_area, - .fault = &file_fault, - .check_huge = &file_check_huge, - .name = "file", -}; - -static struct mem_ops __shmem_ops = { - .setup_area = &shmem_setup_area, - .cleanup_area = &shmem_cleanup_area, - .fault = &anon_fault, - .check_huge = &shmem_check_huge, - .name = "shmem", -}; - -static void __madvise_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - int ret; - struct settings settings = *current_settings(); - - printf("%s...", msg); - - /* - * Prevent khugepaged interference and tests that MADV_COLLAPSE - * ignores /sys/kernel/mm/transparent_hugepage/enabled - */ - settings.thp_enabled = THP_NEVER; - settings.shmem_enabled = SHMEM_NEVER; - push_settings(&settings); - - /* Clear VM_NOHUGEPAGE */ - madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); - if (((bool)ret) == expect) - fail("Fail: Bad return value"); - else if (!ops->check_huge(p, expect ? nr_hpages : 0)) - fail("Fail: check_huge()"); - else - success("OK"); - - pop_settings(); -} - -static void madvise_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } - __madvise_collapse(msg, p, nr_hpages, ops, expect); -} - -#define TICK 500000 -static bool wait_for_scan(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops) -{ - int full_scans; - int timeout = 6; /* 3 seconds */ - - /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } - - madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - - /* Wait until the second full_scan completed */ - full_scans = read_num("khugepaged/full_scans") + 2; - - printf("%s...", msg); - while (timeout--) { - if (ops->check_huge(p, nr_hpages)) - break; - if (read_num("khugepaged/full_scans") >= full_scans) - break; - printf("."); - usleep(TICK); - } - - madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); - - return timeout == -1; -} - -static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - if (wait_for_scan(msg, p, nr_hpages, ops)) { - if (expect) - fail("Timeout"); - else - success("OK"); - return; - } - - /* - * For file and shmem memory, khugepaged only retracts pte entries after - * putting the new hugepage in the page cache. The hugepage must be - * subsequently refaulted to install the pmd mapping for the mm. - */ - if (ops != &__anon_ops) - ops->fault(p, 0, nr_hpages * hpage_pmd_size); - - if (ops->check_huge(p, expect ? nr_hpages : 0)) - success("OK"); - else - fail("Fail"); -} - -static struct collapse_context __khugepaged_context = { - .collapse = &khugepaged_collapse, - .enforce_pte_scan_limits = true, - .name = "khugepaged", -}; - -static struct collapse_context __madvise_context = { - .collapse = &madvise_collapse, - .enforce_pte_scan_limits = false, - .name = "madvise", -}; - -static bool is_tmpfs(struct mem_ops *ops) -{ - return ops == &__file_ops && finfo.type == VMA_SHMEM; -} - -static void alloc_at_fault(void) -{ - struct settings settings = *current_settings(); - char *p; - - settings.thp_enabled = THP_ALWAYS; - push_settings(&settings); - - p = alloc_mapping(1); - *p = 1; - printf("Allocate huge page on fault..."); - if (check_huge_anon(p, 1, hpage_pmd_size)) - success("OK"); - else - fail("Fail"); - - pop_settings(); - - madvise(p, page_size, MADV_DONTNEED); - printf("Split huge PMD on MADV_DONTNEED..."); - if (check_huge_anon(p, 0, hpage_pmd_size)) - success("OK"); - else - fail("Fail"); - munmap(p, hpage_pmd_size); -} - -static void collapse_full(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - int nr_hpages = 4; - unsigned long size = nr_hpages * hpage_pmd_size; - - p = ops->setup_area(nr_hpages); - ops->fault(p, 0, size); - c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, - ops, true); - validate_memory(p, 0, size); - ops->cleanup_area(p, size); -} - -static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - c->collapse("Do not collapse empty PTE table", p, 1, ops, false); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, page_size); - c->collapse("Collapse PTE table with single PTE entry present", p, - 1, ops, true); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = *current_settings(); - void *p; - - settings.khugepaged.max_ptes_none = max_ptes_none; - push_settings(&settings); - - p = ops->setup_area(1); - - if (is_tmpfs(ops)) { - /* shmem pages always in the page cache */ - printf("tmpfs..."); - skip("Skip"); - goto skip; - } - - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, - ops, !c->enforce_pte_scan_limits); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - - if (c->enforce_pte_scan_limits) { - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, - true); - validate_memory(p, 0, - (hpage_pmd_nr - max_ptes_none) * page_size); - } -skip: - ops->cleanup_area(p, hpage_pmd_size); - pop_settings(); -} - -static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - - printf("Swapout one page..."); - if (madvise(p, page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, - true); - validate_memory(p, 0, hpage_pmd_size); -out: - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - - printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); - if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, (max_ptes_swap + 1) * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, - !c->enforce_pte_scan_limits); - validate_memory(p, 0, hpage_pmd_size); - - if (c->enforce_pte_scan_limits) { - ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, - hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Collapse with max_ptes_swap pages swapped out", p, - 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - } -out: - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = alloc_hpage(ops); - - if (is_tmpfs(ops)) { - /* MADV_DONTNEED won't evict tmpfs pages */ - printf("tmpfs..."); - skip("Skip"); - goto skip; - } - - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); - madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table with single PTE mapping compound page", - p, 1, ops, true); - validate_memory(p, 0, page_size); -skip: - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = alloc_hpage(ops); - printf("Split huge page leaving single PTE page table full of compound pages..."); - madvise(p, page_size, MADV_NOHUGEPAGE); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table full of compound pages", p, 1, ops, - true); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - int i; - - p = ops->setup_area(1); - for (i = 0; i < hpage_pmd_nr; i++) { - printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", - i + 1, hpage_pmd_nr); - - madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - ops->fault(BASE_ADDR, 0, hpage_pmd_size); - if (!ops->check_huge(BASE_ADDR, 1)) { - printf("Failed to allocate huge page\n"); - exit(EXIT_FAILURE); - } - madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); - - p = mremap(BASE_ADDR - i * page_size, - i * page_size + hpage_pmd_size, - (i + 1) * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, - BASE_ADDR + 2 * hpage_pmd_size); - if (p == MAP_FAILED) { - perror("mremap+unmap"); - exit(EXIT_FAILURE); - } - - p = mremap(BASE_ADDR + 2 * hpage_pmd_size, - (i + 1) * page_size, - (i + 1) * page_size + hpage_pmd_size, - MREMAP_MAYMOVE | MREMAP_FIXED, - BASE_ADDR - (i + 1) * page_size); - if (p == MAP_FAILED) { - perror("mremap+alloc"); - exit(EXIT_FAILURE); - } - } - - ops->cleanup_area(BASE_ADDR, hpage_pmd_size); - ops->fault(p, 0, hpage_pmd_size); - if (!ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table full of different compound pages", p, 1, - ops, true); - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) -{ - int wstatus; - void *p; - - p = ops->setup_area(1); - - printf("Allocate small page..."); - ops->fault(p, 0, page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - printf("Share small page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - ops->fault(p, page_size, 2 * page_size); - c->collapse("Collapse PTE table with single page shared with parent process", - p, 1, ops, true); - - validate_memory(p, 0, page_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has small page..."); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, page_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) -{ - int wstatus; - void *p; - - p = alloc_hpage(ops); - printf("Share huge page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - printf("Split huge page PMD in child process..."); - madvise(p, page_size, MADV_NOHUGEPAGE); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - ops->fault(p, 0, page_size); - - write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - c->collapse("Collapse PTE table full of compound pages in child", - p, 1, ops, true); - write_num("khugepaged/max_ptes_shared", - current_settings()->khugepaged.max_ptes_shared); - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has huge page..."); - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); - int wstatus; - void *p; - - p = alloc_hpage(ops); - printf("Share huge page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - 1, ops, !c->enforce_pte_scan_limits); - - if (c->enforce_pte_scan_limits) { - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * - page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse with max_ptes_shared PTEs shared", - p, 1, ops, true); - } - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has huge page..."); - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void madvise_collapse_existing_thps(struct collapse_context *c, - struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - - /* c->collapse() will find a hugepage and complain - call directly. */ - __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -/* - * Test race with khugepaged where page tables have been retracted and - * pmd cleared. - */ -static void madvise_retracted_page_tables(struct collapse_context *c, - struct mem_ops *ops) -{ - void *p; - int nr_hpages = 1; - unsigned long size = nr_hpages * hpage_pmd_size; - - p = ops->setup_area(nr_hpages); - ops->fault(p, 0, size); - - /* Let khugepaged collapse and leave pmd cleared */ - if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, - ops)) { - fail("Timeout"); - return; - } - success("OK"); - c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, - true); - validate_memory(p, 0, size); - ops->cleanup_area(p, size); -} - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); - fprintf(stderr, "\t\t: :\n"); - fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); - fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); - fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); - fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); - exit(1); -} - -static void parse_test_type(int argc, const char **argv) -{ - char *buf; - const char *token; - - if (argc == 1) { - /* Backwards compatibility */ - khugepaged_context = &__khugepaged_context; - madvise_context = &__madvise_context; - anon_ops = &__anon_ops; - return; - } - - buf = strdup(argv[1]); - token = strsep(&buf, ":"); - - if (!strcmp(token, "all")) { - khugepaged_context = &__khugepaged_context; - madvise_context = &__madvise_context; - } else if (!strcmp(token, "khugepaged")) { - khugepaged_context = &__khugepaged_context; - } else if (!strcmp(token, "madvise")) { - madvise_context = &__madvise_context; - } else { - usage(); - } - - if (!buf) - usage(); - - if (!strcmp(buf, "all")) { - file_ops = &__file_ops; - anon_ops = &__anon_ops; - shmem_ops = &__shmem_ops; - } else if (!strcmp(buf, "anon")) { - anon_ops = &__anon_ops; - } else if (!strcmp(buf, "file")) { - file_ops = &__file_ops; - } else if (!strcmp(buf, "shmem")) { - shmem_ops = &__shmem_ops; - } else { - usage(); - } - - if (!file_ops) - return; - - if (argc != 3) - usage(); -} - -int main(int argc, const char **argv) -{ - struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_ADVISE, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, - /* - * When testing file-backed memory, the collapse path - * looks at how many pages are found in the page cache, not - * what pages are mapped. Disable read ahead optimization so - * pages don't find their way into the page cache unless - * we mem_ops->fault() them in. - */ - .read_ahead_kb = 0, - }; - - parse_test_type(argc, argv); - - if (file_ops) - get_finfo(argv[2]); - - setbuf(stdout, NULL); - - page_size = getpagesize(); - hpage_pmd_size = read_pmd_pagesize(); - hpage_pmd_nr = hpage_pmd_size / page_size; - - default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; - default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; - default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; - default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; - - save_settings(); - push_settings(&default_settings); - - alloc_at_fault(); - -#define TEST(t, c, o) do { \ - if (c && o) { \ - printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ - t(c, o); \ - } \ - } while (0) - - TEST(collapse_full, khugepaged_context, anon_ops); - TEST(collapse_full, khugepaged_context, file_ops); - TEST(collapse_full, khugepaged_context, shmem_ops); - TEST(collapse_full, madvise_context, anon_ops); - TEST(collapse_full, madvise_context, file_ops); - TEST(collapse_full, madvise_context, shmem_ops); - - TEST(collapse_empty, khugepaged_context, anon_ops); - TEST(collapse_empty, madvise_context, anon_ops); - - TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry, khugepaged_context, file_ops); - TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); - TEST(collapse_single_pte_entry, madvise_context, anon_ops); - TEST(collapse_single_pte_entry, madvise_context, file_ops); - TEST(collapse_single_pte_entry, madvise_context, shmem_ops); - - TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_none, khugepaged_context, file_ops); - TEST(collapse_max_ptes_none, madvise_context, anon_ops); - TEST(collapse_max_ptes_none, madvise_context, file_ops); - - TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); - TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); - TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); - - TEST(collapse_full_of_compound, khugepaged_context, anon_ops); - TEST(collapse_full_of_compound, khugepaged_context, file_ops); - TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); - TEST(collapse_full_of_compound, madvise_context, anon_ops); - TEST(collapse_full_of_compound, madvise_context, file_ops); - TEST(collapse_full_of_compound, madvise_context, shmem_ops); - - TEST(collapse_compound_extreme, khugepaged_context, anon_ops); - TEST(collapse_compound_extreme, madvise_context, anon_ops); - - TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); - TEST(collapse_swapin_single_pte, madvise_context, anon_ops); - - TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_swap, madvise_context, anon_ops); - - TEST(collapse_fork, khugepaged_context, anon_ops); - TEST(collapse_fork, madvise_context, anon_ops); - - TEST(collapse_fork_compound, khugepaged_context, anon_ops); - TEST(collapse_fork_compound, madvise_context, anon_ops); - - TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_shared, madvise_context, anon_ops); - - TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); - TEST(madvise_collapse_existing_thps, madvise_context, file_ops); - TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); - - TEST(madvise_retracted_page_tables, madvise_context, file_ops); - TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); - - restore_settings(0); -} diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c deleted file mode 100644 index d8b5b4930412..000000000000 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ /dev/null @@ -1,279 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * KSM functional tests - * - * Copyright 2022, Red Hat, Inc. - * - * Author(s): David Hildenbrand - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include "vm_util.h" - -#define KiB 1024u -#define MiB (1024 * KiB) - -static int ksm_fd; -static int ksm_full_scans_fd; -static int pagemap_fd; -static size_t pagesize; - -static bool range_maps_duplicates(char *addr, unsigned long size) -{ - unsigned long offs_a, offs_b, pfn_a, pfn_b; - - /* - * There is no easy way to check if there are KSM pages mapped into - * this range. We only check that the range does not map the same PFN - * twice by comparing each pair of mapped pages. - */ - for (offs_a = 0; offs_a < size; offs_a += pagesize) { - pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); - /* Page not present or PFN not exposed by the kernel. */ - if (pfn_a == -1ul || !pfn_a) - continue; - - for (offs_b = offs_a + pagesize; offs_b < size; - offs_b += pagesize) { - pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); - if (pfn_b == -1ul || !pfn_b) - continue; - if (pfn_a == pfn_b) - return true; - } - } - return false; -} - -static long ksm_get_full_scans(void) -{ - char buf[10]; - ssize_t ret; - - ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); - if (ret <= 0) - return -errno; - buf[ret] = 0; - - return strtol(buf, NULL, 10); -} - -static int ksm_merge(void) -{ - long start_scans, end_scans; - - /* Wait for two full scans such that any possible merging happened. */ - start_scans = ksm_get_full_scans(); - if (start_scans < 0) - return start_scans; - if (write(ksm_fd, "1", 1) != 1) - return -errno; - do { - end_scans = ksm_get_full_scans(); - if (end_scans < 0) - return end_scans; - } while (end_scans < start_scans + 2); - - return 0; -} - -static char *mmap_and_merge_range(char val, unsigned long size) -{ - char *map; - - map = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANON, -1, 0); - if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; - } - - /* Don't use THP. Ignore if THP are not around on a kernel. */ - if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); - goto unmap; - } - - /* Make sure each page contains the same values to merge them. */ - memset(map, val, size); - if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; - } - - /* Run KSM to trigger merging and wait. */ - if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); - goto unmap; - } - return map; -unmap: - munmap(map, size); - return MAP_FAILED; -} - -static void test_unmerge(void) -{ - const unsigned int size = 2 * MiB; - char *map; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto unmap; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -unmap: - munmap(map, size); -} - -static void test_unmerge_discarded(void) -{ - const unsigned int size = 2 * MiB; - char *map; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - /* Discard half of all mapped pages so we have pte_none() entries. */ - if (madvise(map, size / 2, MADV_DONTNEED)) { - ksft_test_result_fail("MADV_DONTNEED failed\n"); - goto unmap; - } - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto unmap; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -unmap: - munmap(map, size); -} - -#ifdef __NR_userfaultfd -static void test_unmerge_uffd_wp(void) -{ - struct uffdio_writeprotect uffd_writeprotect; - struct uffdio_register uffdio_register; - const unsigned int size = 2 * MiB; - struct uffdio_api uffdio_api; - char *map; - int uffd; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - /* See if UFFD is around. */ - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd < 0) { - ksft_test_result_skip("__NR_userfaultfd failed\n"); - goto unmap; - } - - /* See if UFFD-WP is around. */ - uffdio_api.api = UFFD_API; - uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { - ksft_test_result_fail("UFFDIO_API failed\n"); - goto close_uffd; - } - if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { - ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); - goto close_uffd; - } - - /* Register UFFD-WP, no need for an actual handler. */ - uffdio_register.range.start = (unsigned long) map; - uffdio_register.range.len = size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { - ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); - goto close_uffd; - } - - /* Write-protect the range using UFFD-WP. */ - uffd_writeprotect.range.start = (unsigned long) map; - uffd_writeprotect.range.len = size; - uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; - if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { - ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); - goto close_uffd; - } - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto close_uffd; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -close_uffd: - close(uffd); -unmap: - munmap(map, size); -} -#endif - -int main(int argc, char **argv) -{ - unsigned int tests = 2; - int err; - -#ifdef __NR_userfaultfd - tests++; -#endif - - ksft_print_header(); - ksft_set_plan(tests); - - pagesize = getpagesize(); - - ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); - if (ksm_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); - ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); - if (ksm_full_scans_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); - - test_unmerge(); - test_unmerge_discarded(); -#ifdef __NR_userfaultfd - test_unmerge_uffd_wp(); -#endif - - err = ksft_get_fail_cnt(); - if (err) - ksft_exit_fail_msg("%d out of %d tests failed\n", - err, ksft_test_num()); - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c deleted file mode 100644 index f9eb4d67e0dd..000000000000 --- a/tools/testing/selftests/vm/ksm_tests.c +++ /dev/null @@ -1,849 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include -#include "util.h" - -#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" -#define KSM_FP(s) (KSM_SYSFS_PATH s) -#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 -#define KSM_PAGE_COUNT_DEFAULT 10l -#define KSM_PROT_STR_DEFAULT "rw" -#define KSM_USE_ZERO_PAGES_DEFAULT false -#define KSM_MERGE_ACROSS_NODES_DEFAULT true -#define MB (1ul << 20) - -struct ksm_sysfs { - unsigned long max_page_sharing; - unsigned long merge_across_nodes; - unsigned long pages_to_scan; - unsigned long run; - unsigned long sleep_millisecs; - unsigned long stable_node_chains_prune_millisecs; - unsigned long use_zero_pages; -}; - -enum ksm_test_name { - CHECK_KSM_MERGE, - CHECK_KSM_UNMERGE, - CHECK_KSM_ZERO_PAGE_MERGE, - CHECK_KSM_NUMA_MERGE, - KSM_MERGE_TIME, - KSM_MERGE_TIME_HUGE_PAGES, - KSM_UNMERGE_TIME, - KSM_COW_TIME -}; - -static int ksm_write_sysfs(const char *file_path, unsigned long val) -{ - FILE *f = fopen(file_path, "w"); - - if (!f) { - fprintf(stderr, "f %s\n", file_path); - perror("fopen"); - return 1; - } - if (fprintf(f, "%lu", val) < 0) { - perror("fprintf"); - fclose(f); - return 1; - } - fclose(f); - - return 0; -} - -static int ksm_read_sysfs(const char *file_path, unsigned long *val) -{ - FILE *f = fopen(file_path, "r"); - - if (!f) { - fprintf(stderr, "f %s\n", file_path); - perror("fopen"); - return 1; - } - if (fscanf(f, "%lu", val) != 1) { - perror("fscanf"); - fclose(f); - return 1; - } - fclose(f); - - return 0; -} - -static int str_to_prot(char *prot_str) -{ - int prot = 0; - - if ((strchr(prot_str, 'r')) != NULL) - prot |= PROT_READ; - if ((strchr(prot_str, 'w')) != NULL) - prot |= PROT_WRITE; - if ((strchr(prot_str, 'x')) != NULL) - prot |= PROT_EXEC; - - return prot; -} - -static void print_help(void) -{ - printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); - - printf("Supported :\n" - " -M (page merging)\n" - " -Z (zero pages merging)\n" - " -N (merging of pages in different NUMA nodes)\n" - " -U (page unmerging)\n" - " -P evaluate merging time and speed.\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -H evaluate merging time and speed of area allocated mostly with huge pages\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -D evaluate unmerging time and speed when disabling KSM.\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -C evaluate the time required to break COW of merged pages.\n\n"); - - printf(" -a: specify the access protections of pages.\n" - " must be of the form [rwx].\n" - " Default: %s\n", KSM_PROT_STR_DEFAULT); - printf(" -p: specify the number of pages to test.\n" - " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); - printf(" -l: limit the maximum running time (in seconds) for a test.\n" - " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); - printf(" -z: change use_zero_pages tunable\n" - " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); - printf(" -m: change merge_across_nodes tunable\n" - " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); - printf(" -s: the size of duplicated memory area (in MiB)\n"); - - exit(0); -} - -static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) -{ - void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); - - if (!map_ptr) { - perror("mmap"); - return NULL; - } - memset(map_ptr, data, map_size); - if (mprotect(map_ptr, map_size, prot)) { - perror("mprotect"); - munmap(map_ptr, map_size); - return NULL; - } - - return map_ptr; -} - -static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) -{ - struct timespec cur_time; - unsigned long cur_scan, init_scan; - - if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) - return 1; - cur_scan = init_scan; - - while (cur_scan < init_scan + scan_count) { - if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) - return 1; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { - perror("clock_gettime"); - return 1; - } - if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { - printf("Scan time limit exceeded\n"); - return 1; - } - } - - return 0; -} - -static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) -{ - if (madvise(addr, size, MADV_MERGEABLE)) { - perror("madvise"); - return 1; - } - if (ksm_write_sysfs(KSM_FP("run"), 1)) - return 1; - - /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ - if (ksm_do_scan(2, start_time, timeout)) - return 1; - - return 0; -} - -static int ksm_unmerge_pages(void *addr, size_t size, - struct timespec start_time, int timeout) -{ - if (madvise(addr, size, MADV_UNMERGEABLE)) { - perror("madvise"); - return 1; - } - return 0; -} - -static bool assert_ksm_pages_count(long dupl_page_count) -{ - unsigned long max_page_sharing, pages_sharing, pages_shared; - - if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || - ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || - ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) - return false; - - /* - * Since there must be at least 2 pages for merging and 1 page can be - * shared with the limited number of pages (max_page_sharing), sometimes - * there are 'leftover' pages that cannot be merged. For example, if there - * are 11 pages and max_page_sharing = 10, then only 10 pages will be - * merged and the 11th page won't be affected. As a result, when the number - * of duplicate pages is divided by max_page_sharing and the remainder is 1, - * pages_shared and pages_sharing values will be equal between dupl_page_count - * and dupl_page_count - 1. - */ - if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { - if (pages_shared == dupl_page_count / max_page_sharing && - pages_sharing == pages_shared * (max_page_sharing - 1)) - return true; - } else { - if (pages_shared == (dupl_page_count / max_page_sharing + 1) && - pages_sharing == dupl_page_count - pages_shared) - return true; - } - - return false; -} - -static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) -{ - if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || - numa_available() ? 0 : - ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || - ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || - ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || - ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || - ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), - &ksm_sysfs->stable_node_chains_prune_millisecs) || - ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) - return 1; - - return 0; -} - -static int ksm_restore(struct ksm_sysfs *ksm_sysfs) -{ - if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || - numa_available() ? 0 : - ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || - ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || - ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || - ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || - ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), - ksm_sysfs->stable_node_chains_prune_millisecs) || - ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) - return 1; - - return 0; -} - -static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - /* fill pages with the same data and merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* verify that the right number of pages are merged */ - if (assert_ksm_pages_count(page_count)) { - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - } - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - int page_count = 2; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - /* fill pages with the same data and merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ - memset(map_ptr, '-', 1); - memset(map_ptr + page_size, '+', 1); - - /* get at least 1 scan, so KSM can detect that the pages were modified */ - if (ksm_do_scan(1, start_time, timeout)) - goto err_out; - - /* check that unmerging was successful and 0 pages are currently merged */ - if (assert_ksm_pages_count(0)) { - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - } - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, - bool use_zero_pages, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) - return KSFT_FAIL; - - /* fill pages with zero and try to merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* - * verify that the right number of pages are merged: - * 1) if use_zero_pages is set to 1, empty pages are merged - * with the kernel zero page instead of with each other; - * 2) if use_zero_pages is set to 0, empty pages are not treated specially - * and merged as usual. - */ - if (use_zero_pages && !assert_ksm_pages_count(0)) - goto err_out; - else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) - goto err_out; - - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int get_next_mem_node(int node) -{ - - long node_size; - int mem_node = 0; - int i, max_node = numa_max_node(); - - for (i = node + 1; i <= max_node + node; i++) { - mem_node = i % (max_node + 1); - node_size = numa_node_size(mem_node, NULL); - if (node_size > 0) - break; - } - return mem_node; -} - -static int get_first_mem_node(void) -{ - return get_next_mem_node(numa_max_node()); -} - -static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, - size_t page_size) -{ - void *numa1_map_ptr, *numa2_map_ptr; - struct timespec start_time; - int page_count = 2; - int first_node; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - if (numa_available() < 0) { - perror("NUMA support not enabled"); - return KSFT_SKIP; - } - if (numa_num_configured_nodes() <= 1) { - printf("At least 2 NUMA nodes must be available\n"); - return KSFT_SKIP; - } - if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) - return KSFT_FAIL; - - /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ - first_node = get_first_mem_node(); - numa1_map_ptr = numa_alloc_onnode(page_size, first_node); - numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); - if (!numa1_map_ptr || !numa2_map_ptr) { - perror("numa_alloc_onnode"); - return KSFT_FAIL; - } - - memset(numa1_map_ptr, '*', page_size); - memset(numa2_map_ptr, '*', page_size); - - /* try to merge the pages */ - if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || - ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) - goto err_out; - - /* - * verify that the right number of pages are merged: - * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; - * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is - * only 1 unique page in each node and they can't be shared. - */ - if (merge_across_nodes && !assert_ksm_pages_count(page_count)) - goto err_out; - else if (!merge_across_nodes && !assert_ksm_pages_count(0)) - goto err_out; - - numa_free(numa1_map_ptr, page_size); - numa_free(numa2_map_ptr, page_size); - printf("OK\n"); - return KSFT_PASS; - -err_out: - numa_free(numa1_map_ptr, page_size); - numa_free(numa2_map_ptr, page_size); - printf("Not OK\n"); - return KSFT_FAIL; -} - -static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr, *map_ptr_orig; - struct timespec start_time, end_time; - unsigned long scan_time_ns; - int pagemap_fd, n_normal_pages, n_huge_pages; - - map_size *= MB; - size_t len = map_size; - - len -= len % HPAGE_SIZE; - map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); - map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; - - if (map_ptr_orig == MAP_FAILED) - err(2, "initial mmap"); - - if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - err(2, "open pagemap"); - - n_normal_pages = 0; - n_huge_pages = 0; - for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { - if (allocate_transhuge(p, pagemap_fd) < 0) - n_normal_pages++; - else - n_huge_pages++; - } - printf("Number of normal pages: %d\n", n_normal_pages); - printf("Number of huge pages: %d\n", n_huge_pages); - - memset(map_ptr, '*', len); - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr_orig, len + HPAGE_SIZE); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr_orig, len + HPAGE_SIZE); - return KSFT_FAIL; -} - -static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr; - struct timespec start_time, end_time; - unsigned long scan_time_ns; - - map_size *= MB; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); - if (!map_ptr) - return KSFT_FAIL; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, map_size); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, map_size); - return KSFT_FAIL; -} - -static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr; - struct timespec start_time, end_time; - unsigned long scan_time_ns; - - map_size *= MB; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); - if (!map_ptr) - return KSFT_FAIL; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, map_size); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, map_size); - return KSFT_FAIL; -} - -static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time, end_time; - unsigned long cow_time_ns; - - /* page_count must be less than 2*page_size */ - size_t page_count = 4000; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - for (size_t i = 0; i < page_count - 1; i = i + 2) - memset(map_ptr + page_size * i, '-', 1); - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); - printf("Not merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, - cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / - ((double)cow_time_ns / NSEC_PER_SEC)); - - /* Create 2000 pairs of duplicate pages */ - for (size_t i = 0; i < page_count - 1; i = i + 2) { - memset(map_ptr + page_size * i, '+', i / 2 + 1); - memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); - } - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - for (size_t i = 0; i < page_count - 1; i = i + 2) - memset(map_ptr + page_size * i, '-', 1); - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, - cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / - ((double)cow_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -int main(int argc, char *argv[]) -{ - int ret, opt; - int prot = 0; - int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; - long page_count = KSM_PAGE_COUNT_DEFAULT; - size_t page_size = sysconf(_SC_PAGESIZE); - struct ksm_sysfs ksm_sysfs_old; - int test_name = CHECK_KSM_MERGE; - bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; - bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; - long size_MB = 0; - - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { - switch (opt) { - case 'a': - prot = str_to_prot(optarg); - break; - case 'p': - page_count = atol(optarg); - if (page_count <= 0) { - printf("The number of pages must be greater than 0\n"); - return KSFT_FAIL; - } - break; - case 'l': - ksm_scan_limit_sec = atoi(optarg); - if (ksm_scan_limit_sec <= 0) { - printf("Timeout value must be greater than 0\n"); - return KSFT_FAIL; - } - break; - case 'h': - print_help(); - break; - case 'z': - if (strcmp(optarg, "0") == 0) - use_zero_pages = 0; - else - use_zero_pages = 1; - break; - case 'm': - if (strcmp(optarg, "0") == 0) - merge_across_nodes = 0; - else - merge_across_nodes = 1; - break; - case 's': - size_MB = atoi(optarg); - if (size_MB <= 0) { - printf("Size must be greater than 0\n"); - return KSFT_FAIL; - } - case 'M': - break; - case 'U': - test_name = CHECK_KSM_UNMERGE; - break; - case 'Z': - test_name = CHECK_KSM_ZERO_PAGE_MERGE; - break; - case 'N': - test_name = CHECK_KSM_NUMA_MERGE; - break; - case 'P': - test_name = KSM_MERGE_TIME; - break; - case 'H': - test_name = KSM_MERGE_TIME_HUGE_PAGES; - break; - case 'D': - test_name = KSM_UNMERGE_TIME; - break; - case 'C': - test_name = KSM_COW_TIME; - break; - default: - return KSFT_FAIL; - } - } - - if (prot == 0) - prot = str_to_prot(KSM_PROT_STR_DEFAULT); - - if (access(KSM_SYSFS_PATH, F_OK)) { - printf("Config KSM not enabled\n"); - return KSFT_SKIP; - } - - if (ksm_save_def(&ksm_sysfs_old)) { - printf("Cannot save default tunables\n"); - return KSFT_FAIL; - } - - if (ksm_write_sysfs(KSM_FP("run"), 2) || - ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || - numa_available() ? 0 : - ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || - ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) - return KSFT_FAIL; - - switch (test_name) { - case CHECK_KSM_MERGE: - ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, - ksm_scan_limit_sec, page_size); - break; - case CHECK_KSM_UNMERGE: - ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); - break; - case CHECK_KSM_ZERO_PAGE_MERGE: - ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, - ksm_scan_limit_sec, use_zero_pages, page_size); - break; - case CHECK_KSM_NUMA_MERGE: - ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - merge_across_nodes, page_size); - break; - case KSM_MERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - size_MB); - break; - case KSM_MERGE_TIME_HUGE_PAGES: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); - break; - case KSM_UNMERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); - break; - case KSM_COW_TIME: - ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); - break; - } - - if (ksm_restore(&ksm_sysfs_old)) { - printf("Cannot restore default tunables\n"); - return KSFT_FAIL; - } - - return ret; -} diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c deleted file mode 100644 index 262eae6b58f2..000000000000 --- a/tools/testing/selftests/vm/madv_populate.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests - * - * Copyright 2021, Red Hat, Inc. - * - * Author(s): David Hildenbrand - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include "vm_util.h" - -#ifndef MADV_POPULATE_READ -#define MADV_POPULATE_READ 22 -#endif /* MADV_POPULATE_READ */ -#ifndef MADV_POPULATE_WRITE -#define MADV_POPULATE_WRITE 23 -#endif /* MADV_POPULATE_WRITE */ - -/* - * For now, we're using 2 MiB of private anonymous memory for all tests. - */ -#define SIZE (2 * 1024 * 1024) - -static size_t pagesize; - -static void sense_support(void) -{ - char *addr; - int ret; - - addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (!addr) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, pagesize, MADV_POPULATE_READ); - if (ret) - ksft_exit_skip("MADV_POPULATE_READ is not available\n"); - - ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); - if (ret) - ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); - - munmap(addr, pagesize); -} - -static void test_prot_read(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == EINVAL, - "MADV_POPULATE_WRITE with PROT_READ\n"); - - munmap(addr, SIZE); -} - -static void test_prot_write(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == EINVAL, - "MADV_POPULATE_READ with PROT_WRITE\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); - - munmap(addr, SIZE); -} - -static void test_holes(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ret = munmap(addr + pagesize, pagesize); - if (ret) - ksft_exit_fail_msg("munmap failed\n"); - - /* Hole in the middle */ - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes in the middle\n"); - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes in the middle\n"); - - /* Hole at end */ - ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes at the end\n"); - ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes at the end\n"); - - /* Hole at beginning */ - ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes at the beginning\n"); - ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes at the beginning\n"); - - munmap(addr, SIZE); -} - -static bool range_is_populated(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (!pagemap_is_populated(fd, start)) - ret = false; - close(fd); - return ret; -} - -static bool range_is_not_populated(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (pagemap_is_populated(fd, start)) - ret = false; - close(fd); - return ret; -} - -static void test_populate_read(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ksft_test_result(range_is_not_populated(addr, SIZE), - "range initially not populated\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ\n"); - ksft_test_result(range_is_populated(addr, SIZE), - "range is populated\n"); - - munmap(addr, SIZE); -} - -static void test_populate_write(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ksft_test_result(range_is_not_populated(addr, SIZE), - "range initially not populated\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); - ksft_test_result(range_is_populated(addr, SIZE), - "range is populated\n"); - - munmap(addr, SIZE); -} - -static bool range_is_softdirty(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (!pagemap_is_softdirty(fd, start)) - ret = false; - close(fd); - return ret; -} - -static bool range_is_not_softdirty(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (pagemap_is_softdirty(fd, start)) - ret = false; - close(fd); - return ret; -} - -static void test_softdirty(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - /* Clear any softdirty bits. */ - clear_softdirty(); - ksft_test_result(range_is_not_softdirty(addr, SIZE), - "range is not softdirty\n"); - - /* Populating READ should set softdirty. */ - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ\n"); - ksft_test_result(range_is_not_softdirty(addr, SIZE), - "range is not softdirty\n"); - - /* Populating WRITE should set softdirty. */ - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); - ksft_test_result(range_is_softdirty(addr, SIZE), - "range is softdirty\n"); - - munmap(addr, SIZE); -} - -int main(int argc, char **argv) -{ - int err; - - pagesize = getpagesize(); - - ksft_print_header(); - ksft_set_plan(21); - - sense_support(); - test_prot_read(); - test_prot_write(); - test_holes(); - test_populate_read(); - test_populate_write(); - test_softdirty(); - - err = ksft_get_fail_cnt(); - if (err) - ksft_exit_fail_msg("%d out of %d tests failed\n", - err, ksft_test_num()); - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c deleted file mode 100644 index eed44322d1a6..000000000000 --- a/tools/testing/selftests/vm/map_fixed_noreplace.c +++ /dev/null @@ -1,231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Test that MAP_FIXED_NOREPLACE works. - * - * Copyright 2018, Jann Horn - * Copyright 2018, Michael Ellerman, IBM Corporation. - */ - -#include -#include -#include -#include -#include - -#ifndef MAP_FIXED_NOREPLACE -#define MAP_FIXED_NOREPLACE 0x100000 -#endif - -static void dump_maps(void) -{ - char cmd[32]; - - snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); - system(cmd); -} - -static unsigned long find_base_addr(unsigned long size) -{ - void *addr; - unsigned long flags; - - flags = MAP_PRIVATE | MAP_ANONYMOUS; - addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - if (addr == MAP_FAILED) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } - - if (munmap(addr, size) != 0) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } - return (unsigned long)addr; -} - -int main(void) -{ - unsigned long base_addr; - unsigned long flags, addr, size, page_size; - char *p; - - page_size = sysconf(_SC_PAGE_SIZE); - - //let's find a base addr that is free before we start the tests - size = 5 * page_size; - base_addr = find_base_addr(size); - if (!base_addr) { - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } - - flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; - - // Check we can map all the areas we need below - errno = 0; - addr = base_addr; - size = 5 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } - - errno = 0; - if (munmap((void *)addr, 5 * page_size) != 0) { - dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; - } - printf("unmap() successful\n"); - - errno = 0; - addr = base_addr + page_size; - size = 3 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error: first mmap() failed unexpectedly\n"); - return 1; - } - - /* - * Exact same mapping again: - * base | free | new - * +1 | mapped | new - * +2 | mapped | new - * +3 | mapped | new - * +4 | free | new - */ - errno = 0; - addr = base_addr; - size = 5 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:1: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Second mapping contained within first: - * - * base | free | - * +1 | mapped | - * +2 | mapped | new - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr + (2 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:2: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | new - * +4 | free | new - */ - errno = 0; - addr = base_addr + (3 * page_size); - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:3: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap start of existing mapping: - * base | free | new - * +1 | mapped | new - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:4: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to start of existing mapping: - * base | free | new - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:5: mmap() failed when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | new - */ - errno = 0; - addr = base_addr + (4 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:6: mmap() failed when it shouldn't have\n"); - return 1; - } - - addr = base_addr; - size = 5 * page_size; - if (munmap((void *)addr, size) != 0) { - dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; - } - printf("unmap() successful\n"); - - printf("OK\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c deleted file mode 100644 index 312889edb84a..000000000000 --- a/tools/testing/selftests/vm/map_hugetlb.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Example of using hugepage memory in a user application using the mmap - * system call with MAP_HUGETLB flag. Before running this program make - * sure the administrator has allocated enough default sized huge pages - * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. - */ -#include -#include -#include -#include -#include - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_MASK -#define MAP_HUGE_MASK 0x3f -#endif - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t length) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - int ret; - size_t length = LENGTH; - int flags = FLAGS; - int shift = 0; - - if (argc > 1) - length = atol(argv[1]) << 20; - if (argc > 2) { - shift = atoi(argv[2]); - if (shift) - flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; - } - - if (shift) - printf("%u kB hugepages\n", 1 << (shift - 10)); - else - printf("Default size hugepages\n"); - printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) { - perror("munmap"); - exit(1); - } - - return ret; -} diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c deleted file mode 100644 index 6b8aeaa0bf7a..000000000000 --- a/tools/testing/selftests/vm/map_populate.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2018 Dmitry Safonov, Arista Networks - * - * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef MMAP_SZ -#define MMAP_SZ 4096 -#endif - -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - exit(1); \ - } \ - } while (0) - -static int parent_f(int sock, unsigned long *smap, int child) -{ - int status, ret; - - ret = read(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - *smap = 0x22222BAD; - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = write(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - waitpid(child, &status, 0); - BUG_ON(!WIFEXITED(status), "child in unexpected state"); - - return WEXITSTATUS(status); -} - -static int child_f(int sock, unsigned long *smap, int fd) -{ - int ret, buf = 0; - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_POPULATE, fd, 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); - - ret = write(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - ret = read(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); - BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); - - return 0; -} - -int main(int argc, char **argv) -{ - int sock[2], child, ret; - FILE *ftmp; - unsigned long *smap; - - ftmp = tmpfile(); - BUG_ON(ftmp == 0, "tmpfile()"); - - ret = ftruncate(fileno(ftmp), MMAP_SZ); - BUG_ON(ret, "ftruncate()"); - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_SHARED, fileno(ftmp), 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - *smap = 0xdeadbabe; - /* Probably unnecessary, but let it be. */ - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); - BUG_ON(ret, "socketpair()"); - - child = fork(); - BUG_ON(child == -1, "fork()"); - - if (child) { - ret = close(sock[0]); - BUG_ON(ret, "close()"); - - return parent_f(sock[1], smap, child); - } - - ret = close(sock[1]); - BUG_ON(ret, "close()"); - - return child_f(sock[0], smap, fileno(ftmp)); -} diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c deleted file mode 100644 index 957b9e18c729..000000000000 --- a/tools/testing/selftests/vm/memfd_secret.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corporation, 2021 - * - * Author: Mike Rapoport - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) -#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) -#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) - -#ifdef __NR_memfd_secret - -#define PATTERN 0x55 - -static const int prot = PROT_READ | PROT_WRITE; -static const int mode = MAP_SHARED; - -static unsigned long page_size; -static unsigned long mlock_limit_cur; -static unsigned long mlock_limit_max; - -static int memfd_secret(unsigned int flags) -{ - return syscall(__NR_memfd_secret, flags); -} - -static void test_file_apis(int fd) -{ - char buf[64]; - - if ((read(fd, buf, sizeof(buf)) >= 0) || - (write(fd, buf, sizeof(buf)) >= 0) || - (pread(fd, buf, sizeof(buf), 0) >= 0) || - (pwrite(fd, buf, sizeof(buf), 0) >= 0)) - fail("unexpected file IO\n"); - else - pass("file IO is blocked as expected\n"); -} - -static void test_mlock_limit(int fd) -{ - size_t len; - char *mem; - - len = mlock_limit_cur; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("unable to mmap secret memory\n"); - return; - } - munmap(mem, len); - - len = mlock_limit_max * 2; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem != MAP_FAILED) { - fail("unexpected mlock limit violation\n"); - munmap(mem, len); - return; - } - - pass("mlock limit is respected\n"); -} - -static void try_process_vm_read(int fd, int pipefd[2]) -{ - struct iovec liov, riov; - char buf[64]; - char *mem; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - exit(KSFT_FAIL); - } - - liov.iov_len = riov.iov_len = sizeof(buf); - liov.iov_base = buf; - riov.iov_base = mem; - - if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { - if (errno == ENOSYS) - exit(KSFT_SKIP); - exit(KSFT_PASS); - } - - exit(KSFT_FAIL); -} - -static void try_ptrace(int fd, int pipefd[2]) -{ - pid_t ppid = getppid(); - int status; - char *mem; - long ret; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - perror("pipe write"); - exit(KSFT_FAIL); - } - - ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); - if (ret) { - perror("ptrace_attach"); - exit(KSFT_FAIL); - } - - ret = waitpid(ppid, &status, WUNTRACED); - if ((ret != ppid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitppid result %ld stat %x\n", - ret, status); - exit(KSFT_FAIL); - } - - if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) - exit(KSFT_PASS); - - exit(KSFT_FAIL); -} - -static void check_child_status(pid_t pid, const char *name) -{ - int status; - - waitpid(pid, &status, 0); - - if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { - skip("%s is not supported\n", name); - return; - } - - if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || - WIFSIGNALED(status)) { - pass("%s is blocked as expected\n", name); - return; - } - - fail("%s: unexpected memory access\n", name); -} - -static void test_remote_access(int fd, const char *name, - void (*func)(int fd, int pipefd[2])) -{ - int pipefd[2]; - pid_t pid; - char *mem; - - if (pipe(pipefd)) { - fail("pipe failed: %s\n", strerror(errno)); - return; - } - - pid = fork(); - if (pid < 0) { - fail("fork failed: %s\n", strerror(errno)); - return; - } - - if (pid == 0) { - func(fd, pipefd); - return; - } - - mem = mmap(NULL, page_size, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("Unable to mmap secret memory\n"); - return; - } - - ftruncate(fd, page_size); - memset(mem, PATTERN, page_size); - - if (write(pipefd[1], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - return; - } - - check_child_status(pid, name); -} - -static void test_process_vm_read(int fd) -{ - test_remote_access(fd, "process_vm_read", try_process_vm_read); -} - -static void test_ptrace(int fd) -{ - test_remote_access(fd, "ptrace", try_ptrace); -} - -static int set_cap_limits(rlim_t max) -{ - struct rlimit new; - cap_t cap = cap_init(); - - new.rlim_cur = max; - new.rlim_max = max; - if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error"); - return -1; - } - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error"); - return -2; - } - - return 0; -} - -static void prepare(void) -{ - struct rlimit rlim; - - page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) - ksft_exit_fail_msg("Failed to get page size %s\n", - strerror(errno)); - - if (getrlimit(RLIMIT_MEMLOCK, &rlim)) - ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", - strerror(errno)); - - mlock_limit_cur = rlim.rlim_cur; - mlock_limit_max = rlim.rlim_max; - - printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", - page_size, mlock_limit_cur, mlock_limit_max); - - if (page_size > mlock_limit_cur) - mlock_limit_cur = page_size; - if (page_size > mlock_limit_max) - mlock_limit_max = page_size; - - if (set_cap_limits(mlock_limit_max)) - ksft_exit_fail_msg("Unable to set mlock limit: %s\n", - strerror(errno)); -} - -#define NUM_TESTS 4 - -int main(int argc, char *argv[]) -{ - int fd; - - prepare(); - - ksft_print_header(); - ksft_set_plan(NUM_TESTS); - - fd = memfd_secret(0); - if (fd < 0) { - if (errno == ENOSYS) - ksft_exit_skip("memfd_secret is not supported\n"); - else - ksft_exit_fail_msg("memfd_secret failed: %s\n", - strerror(errno)); - } - - test_mlock_limit(fd); - test_file_apis(fd); - test_process_vm_read(fd); - test_ptrace(fd); - - close(fd); - - ksft_finished(); -} - -#else /* __NR_memfd_secret */ - -int main(int argc, char *argv[]) -{ - printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/vm/migration.c deleted file mode 100644 index 1cec8425e3ca..000000000000 --- a/tools/testing/selftests/vm/migration.c +++ /dev/null @@ -1,193 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * The main purpose of the tests here is to exercise the migration entry code - * paths in the kernel. - */ - -#include "../kselftest_harness.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#define TWOMEG (2<<20) -#define RUNTIME (60) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) - -FIXTURE(migration) -{ - pthread_t *threads; - pid_t *pids; - int nthreads; - int n1; - int n2; -}; - -FIXTURE_SETUP(migration) -{ - int n; - - ASSERT_EQ(numa_available(), 0); - self->nthreads = numa_num_task_cpus() - 1; - self->n1 = -1; - self->n2 = -1; - - for (n = 0; n < numa_max_possible_node(); n++) - if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { - if (self->n1 == -1) { - self->n1 = n; - } else { - self->n2 = n; - break; - } - } - - self->threads = malloc(self->nthreads * sizeof(*self->threads)); - ASSERT_NE(self->threads, NULL); - self->pids = malloc(self->nthreads * sizeof(*self->pids)); - ASSERT_NE(self->pids, NULL); -}; - -FIXTURE_TEARDOWN(migration) -{ - free(self->threads); - free(self->pids); -} - -int migrate(uint64_t *ptr, int n1, int n2) -{ - int ret, tmp; - int status = 0; - struct timespec ts1, ts2; - - if (clock_gettime(CLOCK_MONOTONIC, &ts1)) - return -1; - - while (1) { - if (clock_gettime(CLOCK_MONOTONIC, &ts2)) - return -1; - - if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) - return 0; - - ret = move_pages(0, 1, (void **) &ptr, &n2, &status, - MPOL_MF_MOVE_ALL); - if (ret) { - if (ret > 0) - printf("Didn't migrate %d pages\n", ret); - else - perror("Couldn't migrate pages"); - return -2; - } - - tmp = n2; - n2 = n1; - n1 = tmp; - } - - return 0; -} - -void *access_mem(void *ptr) -{ - uint64_t y = 0; - volatile uint64_t *x = ptr; - - while (1) { - pthread_testcancel(); - y += *x; - } - - return NULL; -} - -/* - * Basic migration entry testing. One thread will move pages back and forth - * between nodes whilst other threads try and access them triggering the - * migration entry wait paths in the kernel. - */ -TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) -{ - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) - if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) - perror("Couldn't create thread"); - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(pthread_cancel(self->threads[i]), 0); -} - -/* - * Same as the previous test but with shared memory. - */ -TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) -{ - pid_t pid; - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { - pid = fork(); - if (!pid) - access_mem(ptr); - else - self->pids[i] = pid; - } - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); -} - -/* - * Tests the pmd migration entry paths. - */ -TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) -{ - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); - ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) - if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) - perror("Couldn't create thread"); - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(pthread_cancel(self->threads[i]), 0); -} - -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c deleted file mode 100644 index 782ea94dee2f..000000000000 --- a/tools/testing/selftests/vm/mlock-random-test.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * It tests the mlock/mlock2() when they are invoked - * on randomly memory region. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "mlock2.h" - -#define CHUNK_UNIT (128 * 1024) -#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) -#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT -#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) - -#define TEST_LOOP 100 -#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) - -int set_cap_limits(rlim_t max) -{ - struct rlimit new; - cap_t cap = cap_init(); - - new.rlim_cur = max; - new.rlim_max = max; - if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error\n"); - return -1; - } - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error\n"); - return -2; - } - - return 0; -} - -int get_proc_locked_vm_size(void) -{ - FILE *f; - int ret = -1; - char line[1024] = {0}; - unsigned long lock_size = 0; - - f = fopen("/proc/self/status", "r"); - if (!f) { - perror("fopen"); - return -1; - } - - while (fgets(line, 1024, f)) { - if (strstr(line, "VmLck")) { - ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); - if (ret <= 0) { - printf("sscanf() on VmLck error: %s: %d\n", - line, ret); - fclose(f); - return -1; - } - fclose(f); - return (int)(lock_size << 10); - } - } - - perror("cannot parse VmLck in /proc/self/status\n"); - fclose(f); - return -1; -} - -/* - * Get the MMUPageSize of the memory region including input - * address from proc file. - * - * return value: on error case, 0 will be returned. - * Otherwise the page size(in bytes) is returned. - */ -int get_proc_page_size(unsigned long addr) -{ - FILE *smaps; - char *line; - unsigned long mmupage_size = 0; - size_t size; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - return 0; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, "MMUPageSize")) { - free(line); - line = NULL; - size = 0; - continue; - } - - /* found the MMUPageSize of this section */ - if (sscanf(line, "MMUPageSize: %8lu kB", - &mmupage_size) < 1) { - printf("Unable to parse smaps entry for Size:%s\n", - line); - break; - } - - } - free(line); - if (smaps) - fclose(smaps); - return mmupage_size << 10; -} - -/* - * Test mlock/mlock2() on provided memory chunk. - * It expects the mlock/mlock2() to be successful (within rlimit) - * - * With allocated memory chunk [p, p + alloc_size), this - * test will choose start/len randomly to perform mlock/mlock2 - * [start, start + len] memory range. The range is within range - * of the allocated chunk. - * - * The memory region size alloc_size is within the rlimit. - * So we always expect a success of mlock/mlock2. - * - * VmLck is assumed to be 0 before this test. - * - * return value: 0 - success - * else: failure - */ -int test_mlock_within_limit(char *p, int alloc_size) -{ - int i; - int ret = 0; - int locked_vm_size = 0; - struct rlimit cur; - int page_size = 0; - - getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) { - printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } - - srand(time(NULL)); - for (i = 0; i < TEST_LOOP; i++) { - /* - * - choose mlock/mlock2 randomly - * - choose lock_size randomly but lock_size < alloc_size - * - choose start_offset randomly but p+start_offset+lock_size - * < p+alloc_size - */ - int is_mlock = !!(rand() % 2); - int lock_size = rand() % alloc_size; - int start_offset = rand() % (alloc_size - lock_size); - - if (is_mlock) - ret = mlock(p + start_offset, lock_size); - else - ret = mlock2_(p + start_offset, lock_size, - MLOCK_ONFAULT); - - if (ret) { - printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return ret; - } - } - - /* - * Check VmLck left by the tests. - */ - locked_vm_size = get_proc_locked_vm_size(); - page_size = get_proc_page_size((unsigned long)p); - if (page_size == 0) { - printf("cannot get proc MMUPageSize\n"); - return -1; - } - - if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { - printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", - locked_vm_size, alloc_size); - return -1; - } - - return 0; -} - - -/* - * We expect the mlock/mlock2() to be fail (outof limitation) - * - * With allocated memory chunk [p, p + alloc_size), this - * test will randomly choose start/len and perform mlock/mlock2 - * on [start, start+len] range. - * - * The memory region size alloc_size is above the rlimit. - * And the len to be locked is higher than rlimit. - * So we always expect a failure of mlock/mlock2. - * No locked page number should be increased as a side effect. - * - * return value: 0 - success - * else: failure - */ -int test_mlock_outof_limit(char *p, int alloc_size) -{ - int i; - int ret = 0; - int locked_vm_size = 0, old_locked_vm_size = 0; - struct rlimit cur; - - getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) { - printf("alloc_size[%d] >%u rlimit, violates test condition\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } - - old_locked_vm_size = get_proc_locked_vm_size(); - srand(time(NULL)); - for (i = 0; i < TEST_LOOP; i++) { - int is_mlock = !!(rand() % 2); - int lock_size = (rand() % (alloc_size - cur.rlim_cur)) - + cur.rlim_cur; - int start_offset = rand() % (alloc_size - lock_size); - - if (is_mlock) - ret = mlock(p + start_offset, lock_size); - else - ret = mlock2_(p + start_offset, lock_size, - MLOCK_ONFAULT); - if (ret == 0) { - printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return -1; - } - } - - locked_vm_size = get_proc_locked_vm_size(); - if (locked_vm_size != old_locked_vm_size) { - printf("tests leads to new mlocked page: old[%d], new[%d]\n", - old_locked_vm_size, - locked_vm_size); - return -1; - } - - return 0; -} - -int main(int argc, char **argv) -{ - char *p = NULL; - int ret = 0; - - if (set_cap_limits(MLOCK_RLIMIT_SIZE)) - return -1; - - p = malloc(MLOCK_WITHIN_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); - if (ret) - return ret; - munlock(p, MLOCK_WITHIN_LIMIT_SIZE); - free(p); - - - p = malloc(MLOCK_OUTOF_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); - if (ret) - return ret; - munlock(p, MLOCK_OUTOF_LIMIT_SIZE); - free(p); - - return 0; -} diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c deleted file mode 100644 index 11b2301f3aa3..000000000000 --- a/tools/testing/selftests/vm/mlock2-tests.c +++ /dev/null @@ -1,520 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include "mlock2.h" - -#include "../kselftest.h" - -struct vm_boundaries { - unsigned long start; - unsigned long end; -}; - -static int get_vm_area(unsigned long addr, struct vm_boundaries *area) -{ - FILE *file; - int ret = 1; - char line[1024] = {0}; - char *end_addr; - char *stop; - unsigned long start; - unsigned long end; - - if (!area) - return ret; - - file = fopen("/proc/self/maps", "r"); - if (!file) { - perror("fopen"); - return ret; - } - - memset(area, 0, sizeof(struct vm_boundaries)); - - while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { - printf("cannot parse /proc/self/maps\n"); - goto out; - } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - printf("cannot parse /proc/self/maps\n"); - goto out; - } - stop = '\0'; - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); - - if (start <= addr && end > addr) { - area->start = start; - area->end = end; - ret = 0; - goto out; - } - } -out: - fclose(file); - return ret; -} - -#define VMFLAGS "VmFlags:" - -static bool is_vmflag_set(unsigned long addr, const char *vmflag) -{ - char *line = NULL; - char *flags; - size_t size = 0; - bool ret = false; - FILE *smaps; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - goto out; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, VMFLAGS)) { - free(line); - line = NULL; - size = 0; - continue; - } - - flags = line + strlen(VMFLAGS); - ret = (strstr(flags, vmflag) != NULL); - goto out; - } - -out: - free(line); - fclose(smaps); - return ret; -} - -#define SIZE "Size:" -#define RSS "Rss:" -#define LOCKED "lo" - -static unsigned long get_value_for_name(unsigned long addr, const char *name) -{ - char *line = NULL; - size_t size = 0; - char *value_ptr; - FILE *smaps = NULL; - unsigned long value = -1UL; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - goto out; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, name)) { - free(line); - line = NULL; - size = 0; - continue; - } - - value_ptr = line + strlen(name); - if (sscanf(value_ptr, "%lu kB", &value) < 1) { - printf("Unable to parse smaps entry for Size\n"); - goto out; - } - break; - } - -out: - if (smaps) - fclose(smaps); - free(line); - return value; -} - -static bool is_vma_lock_on_fault(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - /* only one page is faulted in */ - return (vma_rss < vma_size); -} - -#define PRESENT_BIT 0x8000000000000000ULL -#define PFN_MASK 0x007FFFFFFFFFFFFFULL -#define UNEVICTABLE_BIT (1UL << 18) - -static int lock_check(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - return (vma_rss == vma_size); -} - -static int unlock_lock_check(char *map) -{ - if (is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); - return 1; - } - - return 0; -} - -static int test_mlock_lock() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, 0)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(0)"); - goto unmap; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - perror("munlock()"); - goto unmap; - } - - ret = unlock_lock_check(map); - -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int onfault_check(char *map) -{ - *map = 'a'; - if (!is_vma_lock_on_fault((unsigned long)map)) { - printf("VMA is not marked for lock on fault\n"); - return 1; - } - - return 0; -} - -static int unlock_onfault_check(char *map) -{ - unsigned long page_size = getpagesize(); - - if (is_vma_lock_on_fault((unsigned long)map) || - is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is still lock on fault after unlock\n"); - return 1; - } - - return 0; -} - -static int test_mlock_onfault() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; - } - - if (onfault_check(map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("munlock()"); - goto unmap; - } - - ret = unlock_onfault_check(map); -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int test_lock_onfault_of_present() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - *map = 'a'; - - if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; - } - - if (!is_vma_lock_on_fault((unsigned long)map) || - !is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA with present pages is not marked lock on fault\n"); - goto unmap; - } - ret = 0; -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int test_munlockall() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall mmap"); - goto out; - } - - if (mlockall(MCL_CURRENT)) { - perror("mlockall(MCL_CURRENT)"); - goto out; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - if (unlock_lock_check(map)) - goto unmap; - - munmap(map, 2 * page_size); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall second mmap"); - goto out; - } - - if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { - perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); - goto unmap; - } - - if (onfault_check(map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - if (unlock_onfault_check(map)) - goto unmap; - - if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); - goto out; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - ret = unlock_lock_check(map); - -unmap: - munmap(map, 2 * page_size); -out: - munlockall(); - return ret; -} - -static int test_vma_management(bool call_mlock) -{ - int ret = 1; - void *map; - unsigned long page_size = getpagesize(); - struct vm_boundaries page1; - struct vm_boundaries page2; - struct vm_boundaries page3; - - map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("mmap()"); - return ret; - } - - if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock(ONFAULT)\n"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* - * Before we unlock a portion, we need to that all three pages are in - * the same VMA. If they are not we abort this test (Note that this is - * not a failure) - */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("VMAs are not merged to start, aborting test\n"); - ret = 0; - goto out; - } - - if (munlock(map + page_size, page_size)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* All three VMAs should be different */ - if (page1.start == page2.start || page2.start == page3.start) { - printf("failed to split VMA for munlock\n"); - goto out; - } - - /* Now unlock the first and third page and check the VMAs again */ - if (munlock(map, page_size * 3)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* Now all three VMAs should be the same */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("failed to merge VMAs after munlock\n"); - goto out; - } - - ret = 0; -out: - munmap(map, 3 * page_size); - return ret; -} - -static int test_mlockall(int (test_function)(bool call_mlock)) -{ - int ret = 1; - - if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - ret = test_function(false); - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - ret += test_mlock_lock(); - ret += test_mlock_onfault(); - ret += test_munlockall(); - ret += test_lock_onfault_of_present(); - ret += test_vma_management(true); - ret += test_mlockall(test_vma_management); - return ret; -} diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h deleted file mode 100644 index 2a6e76c226bc..000000000000 --- a/tools/testing/selftests/vm/mlock2.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 1 -#endif - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int mlock2_(void *start, size_t len, int flags) -{ -#ifdef __NR_mlock2 - return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -static FILE *seek_to_smaps_entry(unsigned long addr) -{ - FILE *file; - char *line = NULL; - size_t size = 0; - unsigned long start, end; - char perms[5]; - unsigned long offset; - char dev[32]; - unsigned long inode; - char path[BUFSIZ]; - - file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } - - while (getline(&line, &size, file) > 0) { - if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", - &start, &end, perms, &offset, dev, &inode, path) < 6) - goto next; - - if (start <= addr && addr < end) - goto out; - -next: - free(line); - line = NULL; - size = 0; - } - - fclose(file); - file = NULL; - -out: - free(line); - return file; -} diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c deleted file mode 100644 index 6c62966ab5db..000000000000 --- a/tools/testing/selftests/vm/mrelease_test.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2022 Google LLC - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include - -#include "util.h" - -#include "../kselftest.h" - -#ifndef __NR_pidfd_open -#define __NR_pidfd_open -1 -#endif - -#ifndef __NR_process_mrelease -#define __NR_process_mrelease -1 -#endif - -#define MB(x) (x << 20) -#define MAX_SIZE_MB 1024 - -static int alloc_noexit(unsigned long nr_pages, int pipefd) -{ - int ppid = getppid(); - int timeout = 10; /* 10sec timeout to get killed */ - unsigned long i; - char *buf; - - buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, 0, 0); - if (buf == MAP_FAILED) { - perror("mmap failed, halting the test"); - return KSFT_FAIL; - } - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; - - /* Signal the parent that the child is ready */ - if (write(pipefd, "", 1) < 0) { - perror("write"); - return KSFT_FAIL; - } - - /* Wait to be killed (when reparenting happens) */ - while (getppid() == ppid && timeout > 0) { - sleep(1); - timeout--; - } - - munmap(buf, nr_pages * PAGE_SIZE); - - return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; -} - -/* The process_mrelease calls in this test are expected to fail */ -static void run_negative_tests(int pidfd) -{ - int res; - /* Test invalid flags. Expect to fail with EINVAL error code. */ - if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || - errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong flags"); - exit(res); - } - /* - * Test reaping while process is alive with no pending SIGKILL. - * Expect to fail with EINVAL error code. - */ - if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease on a live process"); - exit(res); - } -} - -static int child_main(int pipefd[], size_t size) -{ - int res; - - /* Allocate and fault-in memory and wait to be killed */ - close(pipefd[0]); - res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); - close(pipefd[1]); - return res; -} - -int main(void) -{ - int pipefd[2], pidfd; - bool success, retry; - size_t size; - pid_t pid; - char byte; - int res; - - /* Test a wrong pidfd */ - if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong pidfd"); - exit(res); - } - - /* Start the test with 1MB child memory allocation */ - size = 1; -retry: - /* - * Pipe for the child to signal when it's done allocating - * memory - */ - if (pipe(pipefd)) { - perror("pipe"); - exit(KSFT_FAIL); - } - pid = fork(); - if (pid < 0) { - perror("fork"); - close(pipefd[0]); - close(pipefd[1]); - exit(KSFT_FAIL); - } - - if (pid == 0) { - /* Child main routine */ - res = child_main(pipefd, size); - exit(res); - } - - /* - * Parent main routine: - * Wait for the child to finish allocations, then kill and reap - */ - close(pipefd[1]); - /* Block until the child is ready */ - res = read(pipefd[0], &byte, 1); - close(pipefd[0]); - if (res < 0) { - perror("read"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - pidfd = syscall(__NR_pidfd_open, pid, 0); - if (pidfd < 0) { - perror("pidfd_open"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - /* Run negative tests which require a live child */ - run_negative_tests(pidfd); - - if (kill(pid, SIGKILL)) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("kill"); - exit(res); - } - - success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); - if (!success) { - /* - * If we failed to reap because the child exited too soon, - * before we could call process_mrelease. Double child's memory - * which causes it to spend more time on cleanup and increases - * our chances of reaping its memory before it exits. - * Retry until we succeed or reach MAX_SIZE_MB. - */ - if (errno == ESRCH) { - retry = (size <= MAX_SIZE_MB); - } else { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease"); - waitpid(pid, NULL, 0); - exit(res); - } - } - - /* Cleanup to prevent zombies */ - if (waitpid(pid, NULL, 0) < 0) { - perror("waitpid"); - exit(KSFT_FAIL); - } - close(pidfd); - - if (!success) { - if (retry) { - size *= 2; - goto retry; - } - printf("All process_mrelease attempts failed!\n"); - exit(KSFT_FAIL); - } - - printf("Success reaping a child with %zuMB of memory allocations\n", - size); - return KSFT_PASS; -} diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c deleted file mode 100644 index f01dc4a85b0b..000000000000 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Tests for mremap w/ MREMAP_DONTUNMAP. - * - * Copyright 2020, Brian Geffon - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#ifndef MREMAP_DONTUNMAP -#define MREMAP_DONTUNMAP 4 -#endif - -unsigned long page_size; -char *page_buffer; - -static void dump_maps(void) -{ - char cmd[32]; - - snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); - system(cmd); -} - -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - dump_maps(); \ - exit(1); \ - } \ - } while (0) - -// Try a simple operation for to "test" for kernel support this prevents -// reporting tests as failed when it's run on an older kernel. -static int kernel_support_for_mremap_dontunmap() -{ - int ret = 0; - unsigned long num_pages = 1; - void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - // This simple remap should only fail if MREMAP_DONTUNMAP isn't - // supported. - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); - if (dest_mapping == MAP_FAILED) { - ret = errno; - } else { - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - } - - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); - return ret; -} - -// This helper will just validate that an entire mapping contains the expected -// byte. -static int check_region_contains_byte(void *addr, unsigned long size, char byte) -{ - BUG_ON(size & (page_size - 1), - "check_region_contains_byte expects page multiples"); - BUG_ON((unsigned long)addr & (page_size - 1), - "check_region_contains_byte expects page alignment"); - - memset(page_buffer, byte, page_size); - - unsigned long num_pages = size / page_size; - unsigned long i; - - // Compare each page checking that it contains our expected byte. - for (i = 0; i < num_pages; ++i) { - int ret = - memcmp(addr + (i * page_size), page_buffer, page_size); - if (ret) { - return ret; - } - } - - return 0; -} - -// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving -// the source mapping mapped. -static void mremap_dontunmap_simple() -{ - unsigned long num_pages = 5; - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. -static void mremap_dontunmap_simple_shmem() -{ - unsigned long num_pages = 5; - - int mem_fd = memfd_create("memfd", MFD_CLOEXEC); - BUG_ON(mem_fd < 0, "memfd_create"); - - BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, - "ftruncate"); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, mem_fd, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - BUG_ON(close(mem_fd) < 0, "close"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - if (dest_mapping == MAP_FAILED && errno == EINVAL) { - // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); - return; - } - - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // Because the region is backed by shmem, we will actually see the same - // memory at the source location still. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 'a') != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates MREMAP_DONTUNMAP will move page tables to a specific -// destination using MREMAP_FIXED, also while validating that the source -// remains intact. -static void mremap_dontunmap_simple_fixed() -{ - unsigned long num_pages = 5; - - // Since we want to guarantee that we can remap to a point, we will - // create a mapping up front. - void *dest_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', num_pages * page_size); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - void *remapped_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, - dest_mapping); - BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); - BUG_ON(remapped_mapping != dest_mapping, - "mremap should have placed the remapped mapping at dest_mapping"); - - // The dest mapping will have been unmap by mremap so we expect the Xs - // to be gone and replaced with a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // And the source mapping will have had its ptes dropped. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can MREMAP_DONTUNMAP for a portion of an -// existing mapping. -static void mremap_dontunmap_partial_mapping() -{ - /* - * source mapping: - * -------------- - * | aaaaaaaaaa | - * -------------- - * to become: - * -------------- - * | aaaaa00000 | - * -------------- - * With the destination mapping containing 5 pages of As. - * --------- - * | aaaaa | - * --------- - */ - unsigned long num_pages = 10; - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - // We will grab the last 5 pages of the source and move them. - void *dest_mapping = - mremap(source_mapping + (5 * page_size), 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // We expect the first 5 pages of the source to contain a's and the - // final 5 pages to contain zeros. - BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != - 0, "first 5 pages of source should have original pages"); - BUG_ON(check_region_contains_byte - (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, - "final 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != - 0, "dest mapping should contain ptes from the source"); - - BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can remap over only a portion of a mapping. -static void mremap_dontunmap_partial_mapping_overwrite(void) -{ - /* - * source mapping: - * --------- - * |aaaaa| - * --------- - * dest mapping initially: - * ----------- - * |XXXXXXXXXX| - * ------------ - * Source to become: - * --------- - * |00000| - * --------- - * With the destination mapping containing 5 pages of As. - * ------------ - * |aaaaaXXXXX| - * ------------ - */ - void *source_mapping = - mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', 5 * page_size); - - void *dest_mapping = - mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', 10 * page_size); - - // We will grab the last 5 pages of the source and move them. - void *remapped_mapping = - mremap(source_mapping, 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); - - BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != - 0, "first 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, - "dest mapping should contain ptes from the source"); - - // Finally the last 5 pages shouldn't have been touched. - BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), - 5 * page_size, 'X') != 0, - "dest mapping should have retained the last 5 pages"); - - BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, 5 * page_size) == -1, - "unable to unmap source mapping"); -} - -int main(void) -{ - page_size = sysconf(_SC_PAGE_SIZE); - - // test for kernel support for MREMAP_DONTUNMAP skipping the test if - // not. - if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; - } - - // Keep a page sized buffer around for when we need it. - page_buffer = - mmap(NULL, page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); - - mremap_dontunmap_simple(); - mremap_dontunmap_simple_shmem(); - mremap_dontunmap_simple_fixed(); - mremap_dontunmap_partial_mapping(); - mremap_dontunmap_partial_mapping_overwrite(); - - BUG_ON(munmap(page_buffer, page_size) == -1, - "unable to unmap page buffer"); - - printf("OK\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c deleted file mode 100644 index 9496346973d4..000000000000 --- a/tools/testing/selftests/vm/mremap_test.c +++ /dev/null @@ -1,475 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2020 Google LLC - */ -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define EXPECT_SUCCESS 0 -#define EXPECT_FAILURE 1 -#define NON_OVERLAPPING 0 -#define OVERLAPPING 1 -#define NS_PER_SEC 1000000000ULL -#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ -#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ - -#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) - -struct config { - unsigned long long src_alignment; - unsigned long long dest_alignment; - unsigned long long region_size; - int overlapping; -}; - -struct test { - const char *name; - struct config config; - int expect_failure; -}; - -enum { - _1KB = 1ULL << 10, /* 1KB -> not page aligned */ - _4KB = 4ULL << 10, - _8KB = 8ULL << 10, - _1MB = 1ULL << 20, - _2MB = 2ULL << 20, - _4MB = 4ULL << 20, - _1GB = 1ULL << 30, - _2GB = 2ULL << 30, - PMD = _2MB, - PUD = _1GB, -}; - -#define PTE page_size - -#define MAKE_TEST(source_align, destination_align, size, \ - overlaps, should_fail, test_name) \ -(struct test){ \ - .name = test_name, \ - .config = { \ - .src_alignment = source_align, \ - .dest_alignment = destination_align, \ - .region_size = size, \ - .overlapping = overlaps, \ - }, \ - .expect_failure = should_fail \ -} - -/* - * Returns false if the requested remap region overlaps with an - * existing mapping (e.g text, stack) else returns true. - */ -static bool is_remap_region_valid(void *addr, unsigned long long size) -{ - void *remap_addr = NULL; - bool ret = true; - - /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ - remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - - if (remap_addr == MAP_FAILED) { - if (errno == EEXIST) - ret = false; - } else { - munmap(remap_addr, size); - } - - return ret; -} - -/* Returns mmap_min_addr sysctl tunable from procfs */ -static unsigned long long get_mmap_min_addr(void) -{ - FILE *fp; - int n_matched; - static unsigned long long addr; - - if (addr) - return addr; - - fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); - if (fp == NULL) { - ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - exit(KSFT_SKIP); - } - - n_matched = fscanf(fp, "%llu", &addr); - if (n_matched != 1) { - ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - fclose(fp); - exit(KSFT_SKIP); - } - - fclose(fp); - return addr; -} - -/* - * This test validates that merge is called when expanding a mapping. - * Mapping containing three pages is created, middle page is unmapped - * and then the mapping containing the first page is expanded so that - * it fills the created hole. The two parts should merge creating - * single mapping with three pages. - */ -static void mremap_expand_merge(unsigned long page_size) -{ - char *test_name = "mremap expand merge"; - FILE *fp; - char *line = NULL; - size_t len = 0; - bool success = false; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - munmap(start + page_size, page_size); - mremap(start, page_size, 2 * page_size, 0); - - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) { - ksft_test_result_fail("%s\n", test_name); - return; - } - - while (getline(&line, &len, fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); - - if (first_val == start && second_val == start + 3 * page_size) { - success = true; - break; - } - } - if (success) - ksft_test_result_pass("%s\n", test_name); - else - ksft_test_result_fail("%s\n", test_name); - fclose(fp); -} - -/* - * Returns the start address of the mapping on success, else returns - * NULL on failure. - */ -static void *get_source_mapping(struct config c) -{ - unsigned long long addr = 0ULL; - void *src_addr = NULL; - unsigned long long mmap_min_addr; - - mmap_min_addr = get_mmap_min_addr(); - -retry: - addr += c.src_alignment; - if (addr < mmap_min_addr) - goto retry; - - src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - if (src_addr == MAP_FAILED) { - if (errno == EPERM || errno == EEXIST) - goto retry; - goto error; - } - /* - * Check that the address is aligned to the specified alignment. - * Addresses which have alignments that are multiples of that - * specified are not considered valid. For instance, 1GB address is - * 2MB-aligned, however it will not be considered valid for a - * requested alignment of 2MB. This is done to reduce coincidental - * alignment in the tests. - */ - if (((unsigned long long) src_addr & (c.src_alignment - 1)) || - !((unsigned long long) src_addr & c.src_alignment)) { - munmap(src_addr, c.region_size); - goto retry; - } - - if (!src_addr) - goto error; - - return src_addr; -error: - ksft_print_msg("Failed to map source region: %s\n", - strerror(errno)); - return NULL; -} - -/* Returns the time taken for the remap on success else returns -1. */ -static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) -{ - void *addr, *src_addr, *dest_addr; - unsigned long long i; - struct timespec t_start = {0, 0}, t_end = {0, 0}; - long long start_ns, end_ns, align_mask, ret, offset; - unsigned long long threshold; - - if (threshold_mb == VALIDATION_NO_THRESHOLD) - threshold = c.region_size; - else - threshold = MIN(threshold_mb * _1MB, c.region_size); - - src_addr = get_source_mapping(c); - if (!src_addr) { - ret = -1; - goto out; - } - - /* Set byte pattern */ - srand(pattern_seed); - for (i = 0; i < threshold; i++) - memset((char *) src_addr + i, (char) rand(), 1); - - /* Mask to zero out lower bits of address for alignment */ - align_mask = ~(c.dest_alignment - 1); - /* Offset of destination address from the end of the source region */ - offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment; - addr = (void *) (((unsigned long long) src_addr + c.region_size - + offset) & align_mask); - - /* See comment in get_source_mapping() */ - if (!((unsigned long long) addr & c.dest_alignment)) - addr = (void *) ((unsigned long long) addr | c.dest_alignment); - - /* Don't destroy existing mappings unless expected to overlap */ - while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { - /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { - ksft_print_msg("Couldn't find a valid region to remap to\n"); - ret = -1; - goto out; - } - addr += c.dest_alignment; - } - - clock_gettime(CLOCK_MONOTONIC, &t_start); - dest_addr = mremap(src_addr, c.region_size, c.region_size, - MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); - clock_gettime(CLOCK_MONOTONIC, &t_end); - - if (dest_addr == MAP_FAILED) { - ksft_print_msg("mremap failed: %s\n", strerror(errno)); - ret = -1; - goto clean_up_src; - } - - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (i = 0; i < threshold; i++) { - char c = (char) rand(); - - if (((char *) dest_addr)[i] != c) { - ksft_print_msg("Data after remap doesn't match at offset %d\n", - i); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_addr)[i] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - - start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; - end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; - ret = end_ns - start_ns; - -/* - * Since the destination address is specified using MREMAP_FIXED, subsequent - * mremap will unmap any previous mapping at the address range specified by - * dest_addr and region_size. This significantly affects the remap time of - * subsequent tests. So we clean up mappings after each test. - */ -clean_up_dest: - munmap(dest_addr, c.region_size); -clean_up_src: - munmap(src_addr, c.region_size); -out: - return ret; -} - -static void run_mremap_test_case(struct test test_case, int *failures, - unsigned int threshold_mb, - unsigned int pattern_seed) -{ - long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); - - if (remap_time < 0) { - if (test_case.expect_failure) - ksft_test_result_xfail("%s\n\tExpected mremap failure\n", - test_case.name); - else { - ksft_test_result_fail("%s\n", test_case.name); - *failures += 1; - } - } else { - /* - * Comparing mremap time is only applicable if entire region - * was faulted in. - */ - if (threshold_mb == VALIDATION_NO_THRESHOLD || - test_case.config.region_size <= threshold_mb * _1MB) - ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", - test_case.name, remap_time); - else - ksft_test_result_pass("%s\n", test_case.name); - } -} - -static void usage(const char *cmd) -{ - fprintf(stderr, - "Usage: %s [[-t ] [-p ]]\n" - "-t\t only validate threshold_mb of the remapped region\n" - " \t if 0 is supplied no threshold is used; all tests\n" - " \t are run and remapped regions validated fully.\n" - " \t The default threshold used is 4MB.\n" - "-p\t provide a seed to generate the random pattern for\n" - " \t validating the remapped region.\n", cmd); -} - -static int parse_args(int argc, char **argv, unsigned int *threshold_mb, - unsigned int *pattern_seed) -{ - const char *optstr = "t:p:"; - int opt; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 't': - *threshold_mb = atoi(optarg); - break; - case 'p': - *pattern_seed = atoi(optarg); - break; - default: - usage(argv[0]); - return -1; - } - } - - if (optind < argc) { - usage(argv[0]); - return -1; - } - - return 0; -} - -#define MAX_TEST 13 -#define MAX_PERF_TEST 3 -int main(int argc, char **argv) -{ - int failures = 0; - int i, run_perf_tests; - unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; - unsigned int pattern_seed; - int num_expand_tests = 1; - struct test test_cases[MAX_TEST]; - struct test perf_test_cases[MAX_PERF_TEST]; - int page_size; - time_t t; - - pattern_seed = (unsigned int) time(&t); - - if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) - exit(EXIT_FAILURE); - - ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", - threshold_mb, pattern_seed); - - page_size = sysconf(_SC_PAGESIZE); - - /* Expected mremap failures */ - test_cases[0] = MAKE_TEST(page_size, page_size, page_size, - OVERLAPPING, EXPECT_FAILURE, - "mremap - Source and Destination Regions Overlapping"); - - test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, - NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Destination Address Misaligned (1KB-aligned)"); - test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, - NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Source Address Misaligned (1KB-aligned)"); - - /* Src addr PTE aligned */ - test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, - NON_OVERLAPPING, EXPECT_SUCCESS, - "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); - - /* Src addr 1MB aligned */ - test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); - test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); - - /* Src addr PMD aligned */ - test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); - test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); - test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); - - /* Src addr PUD aligned */ - test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); - test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"); - test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); - test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); - /* - * mremap 1GB region - Page table level aligned time - * comparison. - */ - perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); - perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || - (threshold_mb * _1MB >= _1GB); - - ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); - - for (i = 0; i < ARRAY_SIZE(test_cases); i++) - run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); - - mremap_expand_merge(page_size); - - if (run_perf_tests) { - ksft_print_msg("\n%s\n", - "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); - for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) - run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); - } - - if (failures > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c deleted file mode 100644 index 634d87dfb2a4..000000000000 --- a/tools/testing/selftests/vm/on-fault-limit.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int test_limit(void) -{ - int ret = 1; - struct rlimit lims; - void *map; - - if (getrlimit(RLIMIT_MEMLOCK, &lims)) { - perror("getrlimit"); - return ret; - } - - if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); - if (map != MAP_FAILED) - printf("mmap should have failed, but didn't\n"); - else { - ret = 0; - munmap(map, 2 * lims.rlim_max); - } - - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - - ret += test_limit(); - return ret; -} diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h deleted file mode 100644 index 92f3be3dd8e5..000000000000 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ /dev/null @@ -1,226 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -/* Define some kernel-like types */ -#define u8 __u8 -#define u16 __u16 -#define u32 __u32 -#define u64 __u64 - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -extern int test_nr; -extern int iteration_nr; - -#ifdef __GNUC__ -__attribute__((format(printf, 1, 2))) -#endif -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - -__attribute__((noinline)) int read_ptr(int *ptr); -void expected_pkey_fault(int pkey); -int sys_pkey_alloc(unsigned long flags, unsigned long init_val); -int sys_pkey_free(unsigned long pkey); -int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey); -void record_pkey_malloc(void *ptr, long size, int prot); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ -#include "pkey-x86.h" -#elif defined(__powerpc64__) /* arch */ -#include "pkey-powerpc.h" -#else /* arch */ -#error Architecture not supported -#endif /* arch */ - -#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) - -static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) -{ - u32 shift = pkey_bit_position(pkey); - /* mask out bits from pkey in old value */ - reg &= ~((u64)PKEY_MASK << shift); - /* OR in new bits for pkey */ - reg |= (flags & PKEY_MASK) << shift; - return reg; -} - -static inline u64 get_pkey_bits(u64 reg, int pkey) -{ - u32 shift = pkey_bit_position(pkey); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other higher bits - */ - return ((reg >> shift) & PKEY_MASK); -} - -extern u64 shadow_pkey_reg; - -static inline u64 _read_pkey_reg(int line) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" - " shadow: %016llx\n", - line, pkey_reg, shadow_pkey_reg); - assert(pkey_reg == shadow_pkey_reg); - - return pkey_reg; -} - -#define read_pkey_reg() _read_pkey_reg(__LINE__) - -static inline void write_pkey_reg(u64 pkey_reg) -{ - dprintf4("%s() changing %016llx to %016llx\n", __func__, - __read_pkey_reg(), pkey_reg); - /* will do the shadow check for us: */ - read_pkey_reg(); - __write_pkey_reg(pkey_reg); - shadow_pkey_reg = pkey_reg; - dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, - pkey_reg, __read_pkey_reg()); -} - -/* - * These are technically racy. since something could - * change PKEY register between the read and the write. - */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2; - - if (do_allow) - pkey_reg &= (1<si_pkey; -#else - return (u32 *)(((u8 *)si) + si_pkey_offset); -#endif -} - -static inline int kernel_has_pkeys(void) -{ - /* try allocating a key and see if it succeeds */ - int ret = sys_pkey_alloc(0, 0); - if (ret <= 0) { - return 0; - } - sys_pkey_free(ret); - return 1; -} - -static inline int is_pkeys_supported(void) -{ - /* check if the cpu supports pkeys */ - if (!cpu_has_pkeys()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return 0; - } - - /* check if the kernel supports pkeys */ - if (!kernel_has_pkeys()) { - dprintf1("SKIP: %s: no kernel support\n", __func__); - return 0; - } - - return 1; -} - -#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h deleted file mode 100644 index 1ebb586b2fbc..000000000000 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _PKEYS_POWERPC_H -#define _PKEYS_POWERPC_H - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 386 -#endif -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 384 -# define SYS_pkey_free 385 -#endif -#define REG_IP_IDX PT_NIP -#define REG_TRAPNO PT_TRAP -#define gregs gp_regs -#define fpregs fp_regs -#define si_pkey_offset 0x20 - -#undef PKEY_DISABLE_ACCESS -#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ - -#undef PKEY_DISABLE_WRITE -#define PKEY_DISABLE_WRITE 0x2 - -#define NR_PKEYS 32 -#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey - and 24 other keys that cannot be - represented in the PTE */ -#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, - pkey-1 and exec-only key */ -#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, - pkey-31 and exec-only key */ -#define PKEY_BITS_PER_PKEY 2 -#define HPAGE_SIZE (1UL << 24) -#define PAGE_SIZE sysconf(_SC_PAGESIZE) - -static inline u32 pkey_bit_position(int pkey) -{ - return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; -} - -static inline u64 __read_pkey_reg(void) -{ - u64 pkey_reg; - - asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); - - return pkey_reg; -} - -static inline void __write_pkey_reg(u64 pkey_reg) -{ - u64 amr = pkey_reg; - - dprintf4("%s() changing %016llx to %016llx\n", - __func__, __read_pkey_reg(), pkey_reg); - - asm volatile("isync; mtspr 0xd, %0; isync" - : : "r" ((unsigned long)(amr)) : "memory"); - - dprintf4("%s() pkey register after changing %016llx to %016llx\n", - __func__, __read_pkey_reg(), pkey_reg); -} - -static inline int cpu_has_pkeys(void) -{ - /* No simple way to determine this */ - return 1; -} - -static inline bool arch_is_powervm() -{ - struct stat buf; - - if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && - (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && - (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) - return true; - - return false; -} - -static inline int get_arch_reserved_keys(void) -{ - if (sysconf(_SC_PAGESIZE) == 4096) - return NR_RESERVED_PKEYS_4K; - else - if (arch_is_powervm()) - return NR_RESERVED_PKEYS_64K_4KEYS; - else - return NR_RESERVED_PKEYS_64K_3KEYS; -} - -void expect_fault_on_read_execonly_key(void *p1, int pkey) -{ - /* - * powerpc does not allow userspace to change permissions of exec-only - * keys since those keys are not allocated by userspace. The signal - * handler wont be able to reset the permissions, which means the code - * will infinitely continue to segfault here. - */ - return; -} - -/* 4-byte instructions * 16384 = 64K page */ -#define __page_o_noops() asm(".rept 16384 ; nop; .endr") - -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - - ret = syscall(__NR_subpage_prot, ptr, size, NULL); - if (ret) { - perror("subpage_perm"); - return PTR_ERR_ENOTSUP; - } - - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h deleted file mode 100644 index 72c14cd3ddc7..000000000000 --- a/tools/testing/selftests/vm/pkey-x86.h +++ /dev/null @@ -1,177 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _PKEYS_X86_H -#define _PKEYS_X86_H - -#ifdef __i386__ - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif - -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 - -#else - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif - -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 - -#endif - -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -#define NR_PKEYS 16 -#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ -#define PKEY_BITS_PER_PKEY 2 -#define HPAGE_SIZE (1UL<<21) -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - -static inline u64 __read_pkey_reg(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned pkey_reg; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkey_reg = eax; - return pkey_reg; -} - -static inline void __write_pkey_reg(u64 pkey_reg) -{ - unsigned int eax = pkey_reg; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %016llx to %016llx\n", __func__, - __read_pkey_reg(), pkey_reg); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkey_reg == __read_pkey_reg()); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pkeys(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -static inline int cpu_max_xsave_size(void) -{ - unsigned long XSTATE_CPUID = 0xd; - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); - return ecx; -} - -static inline u32 pkey_bit_position(int pkey) -{ - return pkey * PKEY_BITS_PER_PKEY; -} - -#define XSTATE_PKEY_BIT (9) -#define XSTATE_PKEY 0x200 -#define XSTATE_BV_OFFSET 512 - -int pkey_reg_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKEY is set in XCR0 */ - leaf = XSTATE_PKEY_BIT; - { - __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); - - if (leaf == XSTATE_PKEY_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKEY in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -static inline int get_arch_reserved_keys(void) -{ - return NR_RESERVED_PKEYS; -} - -void expect_fault_on_read_execonly_key(void *p1, int pkey) -{ - int ptr_contents; - - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(pkey); -} - -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) -{ - return PTR_ERR_ENOTSUP; -} - -#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c deleted file mode 100644 index 95f403a0c46d..000000000000 --- a/tools/testing/selftests/vm/protection_keys.c +++ /dev/null @@ -1,1788 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) - * - * There are examples in here of: - * * how to set protection keys on memory - * * how to set/clear bits in pkey registers (the rights register) - * * how to handle SEGV_PKUERR signals and extract pkey-relevant - * information from the siginfo - * - * Things to add: - * make sure KSM and KSM COW breaking works - * prefault pages in at malloc, or not - * protect MPX bounds tables with protection keys? - * make sure VMA splitting/merging is working correctly - * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys - * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel - * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks - * - * Compile like this: - * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - */ -#define _GNU_SOURCE -#define __SANE_USERSPACE_TYPES__ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pkey-helpers.h" - -int iteration_nr = 1; -int test_nr; - -u64 shadow_pkey_reg; -int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -void cat_into_file(char *str, char *file) -{ - int fd = open(file, O_RDWR); - int ret; - - dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); - /* - * these need to be raw because they are called under - * pkey_assert() - */ - if (fd < 0) { - fprintf(stderr, "error opening '%s'\n", str); - perror("error: "); - exit(__LINE__); - } - - ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { - perror("write to file failed"); - fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); - exit(__LINE__); - } - close(fd); -} - -#if CONTROL_TRACING > 0 -static int warned_tracing; -int tracing_root_ok(void) -{ - if (geteuid() != 0) { - if (!warned_tracing) - fprintf(stderr, "WARNING: not run as root, " - "can not do tracing control\n"); - warned_tracing = 1; - return 0; - } - return 1; -} -#endif - -void tracing_on(void) -{ -#if CONTROL_TRACING > 0 -#define TRACEDIR "/sys/kernel/debug/tracing" - char pidstr[32]; - - if (!tracing_root_ok()) - return; - - sprintf(pidstr, "%d", getpid()); - cat_into_file("0", TRACEDIR "/tracing_on"); - cat_into_file("\n", TRACEDIR "/trace"); - if (1) { - cat_into_file("function_graph", TRACEDIR "/current_tracer"); - cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); - } else { - cat_into_file("nop", TRACEDIR "/current_tracer"); - } - cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); - cat_into_file("1", TRACEDIR "/tracing_on"); - dprintf1("enabled tracing\n"); -#endif -} - -void tracing_off(void) -{ -#if CONTROL_TRACING > 0 - if (!tracing_root_ok()) - return; - cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); -#endif -} - -void abort_hooks(void) -{ - fprintf(stderr, "running %s()...\n", __func__); - tracing_off(); -#ifdef SLEEP_ON_ABORT - sleep(SLEEP_ON_ABORT); -#endif -} - -/* - * This attempts to have roughly a page of instructions followed by a few - * instructions that do a write, and another page of instructions. That - * way, we are pretty sure that the write is in the second page of - * instructions and has at least a page of padding behind it. - * - * *That* lets us be sure to madvise() away the write instruction, which - * will then fault, which makes sure that the fault code handles - * execute-only memory properly. - */ -#ifdef __powerpc64__ -/* This way, both 4K and 64K alignment are maintained */ -__attribute__((__aligned__(65536))) -#else -__attribute__((__aligned__(PAGE_SIZE))) -#endif -void lots_o_noops_around_write(int *write_to_me) -{ - dprintf3("running %s()\n", __func__); - __page_o_noops(); - /* Assume this happens in the second page of instructions: */ - *write_to_me = __LINE__; - /* pad out by another page: */ - __page_o_noops(); - dprintf3("%s() done\n", __func__); -} - -void dump_mem(void *dumpme, int len_bytes) -{ - char *c = (void *)dumpme; - int i; - - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); - } -} - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); - - return (u32) get_pkey_bits(pkey_reg, pkey); -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u64 old_pkey_reg = __read_pkey_reg(); - u64 new_pkey_reg; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* modify bits accordingly in old pkey_reg and assign it */ - new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); - - __write_pkey_reg(new_pkey_reg); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" - " pkey_reg now: %016llx old_pkey_reg: %016llx\n", - __func__, pkey, rights, flags, 0, __read_pkey_reg(), - old_pkey_reg); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /* pkey_reg and flags have the same format */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - dprintf1("%s(%d) shadow: 0x%016llx\n", - __func__, pkey, shadow_pkey_reg); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", - __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights &= ~flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, - pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - -/* Failed address bound checks: */ -#ifndef SEGV_BNDERR -# define SEGV_BNDERR 3 -#endif - -#ifndef SEGV_PKUERR -# define SEGV_PKUERR 4 -#endif - -static char *si_code_str(int si_code) -{ - if (si_code == SEGV_MAPERR) - return "SEGV_MAPERR"; - if (si_code == SEGV_ACCERR) - return "SEGV_ACCERR"; - if (si_code == SEGV_BNDERR) - return "SEGV_BNDERR"; - if (si_code == SEGV_PKUERR) - return "SEGV_PKUERR"; - return "UNKNOWN"; -} - -int pkey_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) -{ - ucontext_t *uctxt = vucontext; - int trapno; - unsigned long ip; - char *fpregs; -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u32 *pkey_reg_ptr; - int pkey_reg_offset; -#endif /* arch */ - u64 siginfo_pkey; - u32 *si_pkey_ptr; - - dprint_in_signal = 1; - dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, - __read_pkey_reg(), shadow_pkey_reg); - - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregs = (char *) uctxt->uc_mcontext.fpregs; - - dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", - __func__, trapno, ip, si_code_str(si->si_code), - si->si_code); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ -#ifdef __i386__ - /* - * 32-bit has some extra padding so that userspace can tell whether - * the XSTATE header is present in addition to the "legacy" FPU - * state. We just assume that it is here. - */ - fpregs += 0x70; -#endif /* i386 */ - pkey_reg_offset = pkey_reg_xstate_offset(); - pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - - /* - * If we got a PKEY fault, we *HAVE* to have at least one bit set in - * here. - */ - dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); - if (DEBUG_LEVEL > 4) - dump_mem(pkey_reg_ptr - 128, 256); - pkey_assert(*pkey_reg_ptr); -#endif /* arch */ - - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); - - if ((si->si_code == SEGV_MAPERR) || - (si->si_code == SEGV_ACCERR) || - (si->si_code == SEGV_BNDERR)) { - printf("non-PK si_code, exiting...\n"); - exit(4); - } - - si_pkey_ptr = siginfo_get_pkey_ptr(si); - dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); - dump_mem((u8 *)si_pkey_ptr - 8, 24); - siginfo_pkey = *si_pkey_ptr; - pkey_assert(siginfo_pkey < NR_PKEYS); - last_si_pkey = siginfo_pkey; - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - dprintf1("signal pkey_reg from pkey_reg: %016llx\n", - __read_pkey_reg()); - dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); -#if defined(__i386__) || defined(__x86_64__) /* arch */ - dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); - *(u64 *)pkey_reg_ptr = 0x00000000; - dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); -#elif defined(__powerpc64__) /* arch */ - /* restore access and let the faulting instruction continue */ - pkey_access_allow(siginfo_pkey); -#endif /* arch */ - pkey_faults++; - dprintf1("<<<<==================================================\n"); - dprint_in_signal = 0; -} - -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) -{ - dprint_in_signal = 1; - dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); - dprint_in_signal = 0; -} - -void setup_sigsegv_handler(void) -{ - int r, rs; - struct sigaction newact; - struct sigaction oldact; - - /* #PF is mapped to sigsegv */ - int signum = SIGSEGV; - - newact.sa_handler = 0; - newact.sa_sigaction = signal_handler; - - /*sigset_t - signals to block while in the handler */ - /* get the old signal mask. */ - rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); - pkey_assert(rs == 0); - - /* call sa_sigaction, not sa_handler*/ - newact.sa_flags = SA_SIGINFO; - - newact.sa_restorer = 0; /* void(*)(), obsolete */ - r = sigaction(signum, &newact, &oldact); - r = sigaction(SIGALRM, &newact, &oldact); - pkey_assert(r == 0); -} - -void setup_handlers(void) -{ - signal(SIGCHLD, &sig_chld); - setup_sigsegv_handler(); -} - -pid_t fork_lazy_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - while (1) { - dprintf1("child sleeping...\n"); - sleep(30); - } - } - return forkret; -} - -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) -{ - int ret; - unsigned long init_val = 0x0; - - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - ret = sys_pkey_alloc(0, init_val); - /* - * pkey_alloc() sets PKEY register, so we need to reflect it in - * shadow_pkey_reg: - */ - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - if (ret > 0) { - /* clear both the bits: */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - ~PKEY_MASK); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, - __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - /* - * move the new state in from init_val - * (remember, we cheated and init_val == pkey_reg format) - */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - init_val); - } - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); - /* for shadow checking: */ - read_pkey_reg(); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - return ret; -} - -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - -/* - * I had a bug where pkey bits could be set by mprotect() but - * not cleared. This ensures we get lots of random bit sets - * and clears on the vma and pte pkey bits. - */ -int alloc_random_pkey(void) -{ - int max_nr_pkey_allocs; - int ret; - int i; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - int random_index; - memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); - - /* allocate every possible key and make a note of which ones we got */ - max_nr_pkey_allocs = NR_PKEYS; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - - pkey_assert(nr_alloced > 0); - /* select a random one out of the allocated ones */ - random_index = rand() % nr_alloced; - ret = alloced_pkeys[random_index]; - /* now zero it out so we don't free it next */ - alloced_pkeys[random_index] = 0; - - /* go through the allocated ones that we did not want and free them */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", __func__, - __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - return ret; -} - -int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int nr_iterations = random() % 100; - int ret; - - while (0) { - int rpkey = alloc_random_pkey(); - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - if (nr_iterations-- < 0) - break; - - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - } - pkey_assert(pkey < NR_PKEYS); - - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", __func__, - __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - return ret; -} - -struct pkey_malloc_record { - void *ptr; - long size; - int prot; -}; -struct pkey_malloc_record *pkey_malloc_records; -struct pkey_malloc_record *pkey_last_malloc_record; -long nr_pkey_malloc_records; -void record_pkey_malloc(void *ptr, long size, int prot) -{ - long i; - struct pkey_malloc_record *rec = NULL; - - for (i = 0; i < nr_pkey_malloc_records; i++) { - rec = &pkey_malloc_records[i]; - /* find a free record */ - if (rec) - break; - } - if (!rec) { - /* every record is full */ - size_t old_nr_records = nr_pkey_malloc_records; - size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); - size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); - dprintf2("new_nr_records: %zd\n", new_nr_records); - dprintf2("new_size: %zd\n", new_size); - pkey_malloc_records = realloc(pkey_malloc_records, new_size); - pkey_assert(pkey_malloc_records != NULL); - rec = &pkey_malloc_records[nr_pkey_malloc_records]; - /* - * realloc() does not initialize memory, so zero it from - * the first new record all the way to the end. - */ - for (i = 0; i < new_nr_records - old_nr_records; i++) - memset(rec + i, 0, sizeof(*rec)); - } - dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", - (int)(rec - pkey_malloc_records), rec, ptr, size); - rec->ptr = ptr; - rec->size = size; - rec->prot = prot; - pkey_last_malloc_record = rec; - nr_pkey_malloc_records++; -} - -void free_pkey_malloc(void *ptr) -{ - long i; - int ret; - dprintf3("%s(%p)\n", __func__, ptr); - for (i = 0; i < nr_pkey_malloc_records; i++) { - struct pkey_malloc_record *rec = &pkey_malloc_records[i]; - dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - if ((ptr < rec->ptr) || - (ptr >= rec->ptr + rec->size)) - continue; - - dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - nr_pkey_malloc_records--; - ret = munmap(rec->ptr, rec->size); - dprintf3("munmap ret: %d\n", ret); - pkey_assert(!ret); - dprintf3("clearing rec->ptr, rec: %p\n", rec); - rec->ptr = NULL; - dprintf3("done clearing rec->ptr, rec: %p\n", rec); - return; - } - pkey_assert(false); -} - - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - read_pkey_reg(); - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - read_pkey_reg(); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) -{ - int ret; - void *ptr; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - /* - * Guarantee we can fit at least one huge page in the resulting - * allocation by allocating space for 2: - */ - size = ALIGN_UP(size, HPAGE_SIZE * 2); - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - record_pkey_malloc(ptr, size, prot); - mprotect_pkey(ptr, size, prot, pkey); - - dprintf1("unaligned ptr: %p\n", ptr); - ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); - dprintf1(" aligned ptr: %p\n", ptr); - ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); - dprintf1("MADV_HUGEPAGE ret: %d\n", ret); - ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); - dprintf1("MADV_WILLNEED ret: %d\n", ret); - memset(ptr, 0, HPAGE_SIZE); - - dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -int hugetlb_setup_ok; -#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" -#define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) -{ - int err; - int fd; - char buf[256]; - long hpagesz_kb; - long hpagesz_mb; - - if (geteuid() != 0) { - fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); - return; - } - - cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); - - /* - * Now go make sure that we got the pages and that they - * are PMD-level pages. Someone might have made PUD-level - * pages the default. - */ - hpagesz_kb = HPAGE_SIZE / 1024; - hpagesz_mb = hpagesz_kb / 1024; - sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); - fd = open(buf, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - /* -1 to guarantee leaving the trailing \0 */ - err = read(fd, buf, sizeof(buf)-1); - close(fd); - if (err <= 0) { - fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", - hpagesz_mb, buf, GET_NR_HUGE_PAGES); - return; - } - - hugetlb_setup_ok = 1; -} - -void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) -{ - void *ptr; - int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; - - if (!hugetlb_setup_ok) - return PTR_ERR_ENOTSUP; - - dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); - size = ALIGN_UP(size, HPAGE_SIZE * 2); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - pkey_assert(ptr != (void *)-1); - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) -{ - void *ptr; - int fd; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - fd = open("/dax/foo", O_RDWR); - pkey_assert(fd >= 0); - - ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); - pkey_assert(ptr != (void *)-1); - - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); - close(fd); - return ptr; -} - -void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { - - malloc_pkey_with_mprotect, - malloc_pkey_with_mprotect_subpage, - malloc_pkey_anon_huge, - malloc_pkey_hugetlb -/* can not do direct with the pkey_mprotect() API: - malloc_pkey_mmap_direct, - malloc_pkey_mmap_dax, -*/ -}; - -void *malloc_pkey(long size, int prot, u16 pkey) -{ - void *ret; - static int malloc_type; - int nr_malloc_types = ARRAY_SIZE(pkey_malloc); - - pkey_assert(pkey < NR_PKEYS); - - while (1) { - pkey_assert(malloc_type < nr_malloc_types); - - ret = pkey_malloc[malloc_type](size, prot, pkey); - pkey_assert(ret != (void *)-1); - - malloc_type++; - if (malloc_type >= nr_malloc_types) - malloc_type = (random()%nr_malloc_types); - - /* try again if the malloc_type we tried is unsupported */ - if (ret == PTR_ERR_ENOTSUP) - continue; - - break; - } - - dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, - size, prot, pkey, ret); - return ret; -} - -int last_pkey_faults; -#define UNKNOWN_PKEY -2 -void expected_pkey_fault(int pkey) -{ - dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", - __func__, last_pkey_faults, pkey_faults); - dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkey_faults + 1 == pkey_faults); - - /* - * For exec-only memory, we do not know the pkey in - * advance, so skip this check. - */ - if (pkey != UNKNOWN_PKEY) - pkey_assert(last_si_pkey == pkey); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ - /* - * The signal handler shold have cleared out PKEY register to let the - * test program continue. We now have to restore it. - */ - if (__read_pkey_reg() != 0) -#else /* arch */ - if (__read_pkey_reg() != shadow_pkey_reg) -#endif /* arch */ - pkey_assert(0); - - __write_pkey_reg(shadow_pkey_reg); - dprintf1("%s() set pkey_reg=%016llx to restore state after signal " - "nuked it\n", __func__, shadow_pkey_reg); - last_pkey_faults = pkey_faults; - last_si_pkey = -1; -} - -#define do_not_expect_pkey_fault(msg) do { \ - if (last_pkey_faults != pkey_faults) \ - dprintf0("unexpected PKey fault: %s\n", msg); \ - pkey_assert(last_pkey_faults == pkey_faults); \ -} while (0) - -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) -{ - pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); - test_fds[nr_test_fds] = fd; - nr_test_fds++; -} - -int get_test_read_fd(void) -{ - int test_fd = open("/etc/passwd", O_RDONLY); - __save_test_fd(test_fd); - return test_fd; -} - -void close_test_fds(void) -{ - int i; - - for (i = 0; i < nr_test_fds; i++) { - if (test_fds[i] < 0) - continue; - close(test_fds[i]); - test_fds[i] = -1; - } - nr_test_fds = 0; -} - -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - -void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) -{ - int i, err; - int max_nr_pkey_allocs; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - long size; - - pkey_assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - - /* allocate every possible key and make sure key-0 never got allocated */ - max_nr_pkey_allocs = NR_PKEYS; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - pkey_assert(new_pkey != 0); - - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - /* free all the allocated keys */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - - /* attach key-0 in various modes */ - err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); - pkey_assert(!err); -} - -void test_read_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling write access to PKEY[1], doing read\n"); - pkey_write_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - dprintf1("\n"); -} -void test_read_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", - pkey, ptr); - ptr_contents = read_ptr(ptr); - dprintf1("reading ptr before disabling the read : %d\n", - ptr_contents); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling write access; after accessing the page, " - "to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} -void test_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling access; after accessing the page, " - " to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - dprintf1("disabling access to PKEY[%02d], " - "having kernel read() to buffer\n", pkey); - pkey_access_deny(pkey); - ret = read(test_fd, ptr, 1); - dprintf1("read ret: %d\n", ret); - pkey_assert(ret); -} -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - pkey_write_deny(pkey); - ret = read(test_fd, ptr, 100); - dprintf1("read ret: %d\n", ret); - if (ret < 0 && (DEBUG_LEVEL > 0)) - perror("verbose read result (OK for this to be bad)"); - pkey_assert(ret); -} - -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) -{ - int pipe_ret, vmsplice_ret; - struct iovec iov; - int pipe_fds[2]; - - pipe_ret = pipe(pipe_fds); - - pkey_assert(pipe_ret == 0); - dprintf1("disabling access to PKEY[%02d], " - "having kernel vmsplice from buffer\n", pkey); - pkey_access_deny(pkey); - iov.iov_base = ptr; - iov.iov_len = PAGE_SIZE; - vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); - dprintf1("vmsplice() ret: %d\n", vmsplice_ret); - pkey_assert(vmsplice_ret == -1); - - close(pipe_fds[0]); - close(pipe_fds[1]); -} - -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) -{ - int ignored = 0xdada; - int futex_ret; - int some_int = __LINE__; - - dprintf1("disabling write to PKEY[%02d], " - "doing futex gunk in buffer\n", pkey); - *ptr = some_int; - pkey_write_deny(pkey); - futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, - &ignored, ignored); - if (DEBUG_LEVEL > 0) - perror("futex"); - dprintf1("futex() ret: %d\n", futex_ret); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) -{ - int err; - int i; - - /* Note: 0 is the default pkey, so don't mess with it */ - for (i = 1; i < NR_PKEYS; i++) { - if (pkey == i) - continue; - - dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); - pkey_assert(err); - } -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) -{ - int err; - int bad_pkey = NR_PKEYS+99; - - /* pass a known-invalid pkey in: */ - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); - pkey_assert(err); -} - -void become_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - return; - } - exit(0); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS*3; i++) { - int new_pkey; - dprintf1("%s() alloc loop: %d\n", __func__, i); - new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, err, __read_pkey_reg(), - shadow_pkey_reg); - read_pkey_reg(); /* for shadow checking */ - dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); - if ((new_pkey == -1) && (errno == ENOSPC)) { - dprintf2("%s() failed to allocate pkey after %d tries\n", - __func__, nr_allocated_pkeys); - } else { - /* - * Ensure the number of successes never - * exceeds the number of keys supported - * in the hardware. - */ - pkey_assert(nr_allocated_pkeys < NR_PKEYS); - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - /* - * Make sure that allocation state is properly - * preserved across fork(). - */ - if (i == NR_PKEYS*2) - become_child(); - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - /* - * On x86: - * There are 16 pkeys supported in hardware. Three are - * allocated by the time we get here: - * 1. The default key (0) - * 2. One possibly consumed by an execute-only mapping. - * 3. One allocated by the test code and passed in via - * 'pkey' to this function. - * Ensure that we can allocate at least another 13 (16-3). - * - * On powerpc: - * There are either 5, 28, 29 or 32 pkeys supported in - * hardware depending on the page size (4K or 64K) and - * platform (powernv or powervm). Four are allocated by - * the time we get here. These include pkey-0, pkey-1, - * exec-only pkey and the one allocated by the test code. - * Ensure that we can allocate the remaining. - */ - pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -void arch_force_pkey_reg_init(void) -{ -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u64 *buf; - - /* - * All keys should be allocated and set to allow reads and - * writes, so the register should be all 0. If not, just - * skip the test. - */ - if (read_pkey_reg()) - return; - - /* - * Just allocate an absurd about of memory rather than - * doing the XSAVE size enumeration dance. - */ - buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - - /* These __builtins require compiling with -mxsave */ - - /* XSAVE to build a valid buffer: */ - __builtin_ia32_xsave(buf, XSTATE_PKEY); - /* Clear XSTATE_BV[PKRU]: */ - buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; - /* XRSTOR will likely get PKRU back to the init state: */ - __builtin_ia32_xrstor(buf, XSTATE_PKEY); - - munmap(buf, 1*MB); -#endif -} - - -/* - * This is mostly useless on ppc for now. But it will not - * hurt anything and should give some better coverage as - * a long-running test that continually checks the pkey - * register. - */ -void test_pkey_init_state(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS; i++) { - int new_pkey = alloc_pkey(); - - if (new_pkey < 0) - continue; - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - arch_force_pkey_reg_init(); - - /* - * Loop for a bit, hoping to get exercise the kernel - * context switch code. - */ - for (i = 0; i < 1000000; i++) - read_pkey_reg(); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -/* - * pkey 0 is special. It is allocated by default, so you do not - * have to call pkey_alloc() to use it first. Make sure that it - * is usable. - */ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) -{ - long size; - int prot; - - assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - prot = pkey_last_malloc_record->prot; - - /* Use pkey 0 */ - mprotect_pkey(ptr, size, prot, 0); - - /* Make sure that we can set it back to the original pkey. */ - mprotect_pkey(ptr, size, prot, pkey); -} - -void test_ptrace_of_child(int *ptr, u16 pkey) -{ - __attribute__((__unused__)) int peek_result; - pid_t child_pid; - void *ignored = 0; - long ret; - int status; - /* - * This is the "control" for our little expermient. Make sure - * we can always access it when ptracing. - */ - int *plain_ptr_unaligned = malloc(HPAGE_SIZE); - int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); - - /* - * Fork a child which is an exact copy of this process, of course. - * That means we can do all of our tests via ptrace() and then plain - * memory access and ensure they work differently. - */ - child_pid = fork_lazy_child(); - dprintf1("[%d] child pid: %d\n", getpid(), child_pid); - - ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); - if (ret) - perror("attach"); - dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); - pkey_assert(ret != -1); - ret = waitpid(child_pid, &status, WUNTRACED); - if ((ret != child_pid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitpid result %ld stat %x\n", - ret, status); - pkey_assert(0); - } - dprintf2("waitpid ret: %ld\n", ret); - dprintf2("waitpid status: %d\n", status); - - pkey_access_deny(pkey); - pkey_write_deny(pkey); - - /* Write access, untested for now: - ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); - pkey_assert(ret != -1); - dprintf1("poke at %p: %ld\n", peek_at, ret); - */ - - /* - * Try to access the pkey-protected "ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect an exception: */ - peek_result = read_ptr(ptr); - expected_pkey_fault(pkey); - - /* - * Try to access the NON-pkey-protected "plain_ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect NO exception: */ - peek_result = read_ptr(plain_ptr); - do_not_expect_pkey_fault("read plain pointer after ptrace"); - - ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); - pkey_assert(ret != -1); - - ret = kill(child_pid, SIGKILL); - pkey_assert(ret != -1); - - wait(&status); - - free(plain_ptr_unaligned); -} - -void *get_pointer_to_instructions(void) -{ - void *p1; - - p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); - dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); - /* lots_o_noops_around_write should be page-aligned already */ - assert(p1 == &lots_o_noops_around_write); - - /* Point 'p1' at the *second* page of the function: */ - p1 += PAGE_SIZE; - - /* - * Try to ensure we fault this in on next touch to ensure - * we get an instruction fault as opposed to a data one - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - - return p1; -} - -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); - pkey_assert(!ret); - pkey_access_deny(pkey); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* - * Make sure this is an *instruction* fault - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, pkey); -} - -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - dprintf1("%s() start\n", __func__); - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - /* Use a *normal* mprotect(), not mprotect_pkey(): */ - ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); - pkey_assert(!ret); - - /* - * Reset the shadow, assuming that the above mprotect() - * correctly changed PKRU, but to an unknown value since - * the actual allocated pkey is unknown. - */ - shadow_pkey_reg = __read_pkey_reg(); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* Make sure this is an *instruction* fault */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); - - /* - * Put the memory back to non-PROT_EXEC. Should clear the - * exec-only pkey off the VMA and allow it to be readable - * again. Go to PROT_NONE first to check for a kernel bug - * that did not clear the pkey when doing PROT_NONE. - */ - ret = mprotect(p1, PAGE_SIZE, PROT_NONE); - pkey_assert(!ret); - - ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); - pkey_assert(!ret); - ptr_contents = read_ptr(p1); - do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); -} - -#if defined(__i386__) || defined(__x86_64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) -{ - u32 new_pkru; - pid_t child; - int status, ret; - int pkey_offset = pkey_reg_xstate_offset(); - size_t xsave_size = cpu_max_xsave_size(); - void *xsave; - u32 *pkey_register; - u64 *xstate_bv; - struct iovec iov; - - new_pkru = ~read_pkey_reg(); - /* Don't make PROT_EXEC mappings inaccessible */ - new_pkru &= ~3; - - child = fork(); - pkey_assert(child >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), child); - if (!child) { - ptrace(PTRACE_TRACEME, 0, 0, 0); - /* Stop and allow the tracer to modify PKRU directly */ - raise(SIGSTOP); - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - if (__read_pkey_reg() != new_pkru) - exit(1); - - /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ - raise(SIGSTOP); - - if (__read_pkey_reg() != 0) - exit(1); - - /* Stop and allow the tracer to examine PKRU */ - raise(SIGSTOP); - - exit(0); - } - - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - xsave = (void *)malloc(xsave_size); - pkey_assert(xsave > 0); - - /* Modify the PKRU register directly */ - iov.iov_base = xsave; - iov.iov_len = xsave_size; - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - pkey_register = (u32 *)(xsave + pkey_offset); - pkey_assert(*pkey_register == read_pkey_reg()); - - *pkey_register = new_pkru; - - ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Execute the tracee */ - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value change */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Clear the PKRU bit from XSTATE_BV */ - xstate_bv = (u64 *)(xsave + 512); - *xstate_bv &= ~(1 << 9); - - ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value go to 0 */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFEXITED(status)); - pkey_assert(WEXITSTATUS(status) == 0); - free(xsave); -} -#endif - -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) -{ - int size = PAGE_SIZE; - int sret; - - if (cpu_has_pkeys()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return; - } - - sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); - pkey_assert(sret < 0); -} - -void (*pkey_tests[])(int *ptr, u16 pkey) = { - test_read_of_write_disabled_region, - test_read_of_access_disabled_region, - test_read_of_access_disabled_region_with_page_already_mapped, - test_write_of_write_disabled_region, - test_write_of_write_disabled_region_with_page_already_mapped, - test_write_of_access_disabled_region, - test_write_of_access_disabled_region_with_page_already_mapped, - test_kernel_write_of_access_disabled_region, - test_kernel_write_of_write_disabled_region, - test_kernel_gup_of_access_disabled_region, - test_kernel_gup_write_to_write_disabled_region, - test_executing_on_unreadable_memory, - test_implicit_mprotect_exec_only_memory, - test_mprotect_with_pkey_0, - test_ptrace_of_child, - test_pkey_init_state, - test_pkey_syscalls_on_non_allocated_pkey, - test_pkey_syscalls_bad_args, - test_pkey_alloc_exhaust, - test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) - test_ptrace_modifies_pkru, -#endif -}; - -void run_tests_once(void) -{ - int *ptr; - int prot = PROT_READ|PROT_WRITE; - - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { - int pkey; - int orig_pkey_faults = pkey_faults; - - dprintf1("======================\n"); - dprintf1("test %d preparing...\n", test_nr); - - tracing_on(); - pkey = alloc_random_pkey(); - dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); - ptr = malloc_pkey(PAGE_SIZE, prot, pkey); - dprintf1("test %d starting...\n", test_nr); - pkey_tests[test_nr](ptr, pkey); - dprintf1("freeing test memory: %p\n", ptr); - free_pkey_malloc(ptr); - sys_pkey_free(pkey); - - dprintf1("pkey_faults: %d\n", pkey_faults); - dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); - - tracing_off(); - close_test_fds(); - - printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); - dprintf1("======================\n\n"); - } - iteration_nr++; -} - -void pkey_setup_shadow(void) -{ - shadow_pkey_reg = __read_pkey_reg(); -} - -int main(void) -{ - int nr_iterations = 22; - int pkeys_supported = is_pkeys_supported(); - - srand((unsigned int)time(NULL)); - - setup_handlers(); - - printf("has pkeys: %d\n", pkeys_supported); - - if (!pkeys_supported) { - int size = PAGE_SIZE; - int *ptr; - - printf("running PKEY tests for unsupported CPU/OS\n"); - - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - assert(ptr != (void *)-1); - test_mprotect_pkey_on_unsupported_cpu(ptr, 1); - exit(0); - } - - pkey_setup_shadow(); - printf("startup pkey_reg: %016llx\n", read_pkey_reg()); - setup_hugetlbfs(); - - while (nr_iterations-- > 0) - run_tests_once(); - - printf("done (all tests OK)\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh deleted file mode 100755 index 8984e0bb58c7..000000000000 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ /dev/null @@ -1,274 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# Please run as root - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -exitcode=0 - -usage() { - cat <"] - -t: specify specific categories to tests to run - -h: display this message - -The default behavior is to run all tests. - -Alternatively, specific groups tests can be run by passing a string -to the -t argument containing one or more of the following categories -separated by spaces: -- mmap - tests for mmap(2) -- gup_test - tests for gup using gup_test interface -- userfaultfd - tests for userfaultfd(2) -- compaction - a test for the patch "Allow compaction of unevictable pages" -- mlock - tests for mlock(2) -- mremap - tests for mremap(2) -- hugevm - tests for very large virtual address space -- vmalloc - vmalloc smoke tests -- hmm - hmm smoke tests -- madv_populate - test memadvise(2) MADV_POPULATE_{READ,WRITE} options -- memfd_secret - test memfd_secret(2) -- process_mrelease - test process_mrelease(2) -- ksm - ksm tests that do not require >=2 NUMA nodes -- ksm_numa - ksm tests that require >=2 NUMA nodes -- pkey - memory protection key tests -- soft_dirty - test soft dirty page bit semantics -- cow - test copy-on-write semantics -example: ./run_vmtests.sh -t "hmm mmap ksm" -EOF - exit 0 -} - - -while getopts "ht:" OPT; do - case ${OPT} in - "h") usage ;; - "t") VM_SELFTEST_ITEMS=${OPTARG} ;; - esac -done -shift $((OPTIND -1)) - -# default behavior: run all tests -VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} - -test_selected() { - if [ "$VM_SELFTEST_ITEMS" == "default" ]; then - # If no VM_SELFTEST_ITEMS are specified, run all tests - return 0 - fi - # If test selected argument is one of the test items - if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then - return 0 - else - return 1 - fi -} - -# get huge pagesize and freepages from /proc/meminfo -while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs="$size" - fi - if [ "$name" = "Hugepagesize:" ]; then - hpgsize_KB="$size" - fi -done < /proc/meminfo - -# Simple hugetlbfs tests have a hardcoded minimum requirement of -# huge pages totaling 256MB (262144KB) in size. The userfaultfd -# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take -# both of these requirements into account and attempt to increase -# number of huge pages available. -nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) - -# set proper nr_hugepages -if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then - nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - needpgs=$((needmem_KB / hpgsize_KB)) - tries=2 - while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do - lackpgs=$((needpgs - freepgs)) - echo 3 > /proc/sys/vm/drop_caches - if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then - echo "Please run this test as root" - exit $ksft_skip - fi - while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs=$size - fi - done < /proc/meminfo - tries=$((tries - 1)) - done - if [ "$freepgs" -lt "$needpgs" ]; then - printf "Not enough huge pages available (%d < %d)\n" \ - "$freepgs" "$needpgs" - exit 1 - fi -else - echo "no hugetlbfs support in kernel?" - exit 1 -fi - -# filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" -if [ -z "$ARCH" ]; then - ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') -fi -VADDR64=0 -echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 - -# Usage: run_test [test binary] [arbitrary test arguments...] -run_test() { - if test_selected ${CATEGORY}; then - local title="running $*" - local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) - printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" - - "$@" - local ret=$? - if [ $ret -eq 0 ]; then - echo "[PASS]" - elif [ $ret -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip - else - echo "[FAIL]" - exitcode=1 - fi - fi # test_selected -} - -CATEGORY="hugetlb" run_test ./hugepage-mmap - -shmmax=$(cat /proc/sys/kernel/shmmax) -shmall=$(cat /proc/sys/kernel/shmall) -echo 268435456 > /proc/sys/kernel/shmmax -echo 4194304 > /proc/sys/kernel/shmall -CATEGORY="hugetlb" run_test ./hugepage-shm -echo "$shmmax" > /proc/sys/kernel/shmmax -echo "$shmall" > /proc/sys/kernel/shmall - -CATEGORY="hugetlb" run_test ./map_hugetlb -CATEGORY="hugetlb" run_test ./hugepage-mremap -CATEGORY="hugetlb" run_test ./hugepage-vmemmap -CATEGORY="hugetlb" run_test ./hugetlb-madvise - -if test_selected "hugetlb"; then - echo "NOTE: These hugetlb tests provide minimal coverage. Use" - echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" - echo " hugetlb regression testing." -fi - -CATEGORY="mmap" run_test ./map_fixed_noreplace - -# get_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -u -# pin_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -a -# Dump pages 0, 19, and 4096, using pin_user_pages: -CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 - -uffd_mods=("" ":dev") -for mod in "${uffd_mods[@]}"; do - CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 - # Hugetlb tests require source and destination huge pages. Pass in half - # the size ($half_ufd_size_MB), which is used for *each*. - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 -done - -#cleanup -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages - -CATEGORY="compaction" run_test ./compaction_test - -CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit - -CATEGORY="mmap" run_test ./map_populate - -CATEGORY="mlock" run_test ./mlock-random-test - -CATEGORY="mlock" run_test ./mlock2-tests - -CATEGORY="process_mrelease" run_test ./mrelease_test - -CATEGORY="mremap" run_test ./mremap_test - -CATEGORY="hugetlb" run_test ./thuge-gen - -if [ $VADDR64 -ne 0 ]; then - CATEGORY="hugevm" run_test ./virtual_address_range - - # virtual address 128TB switch test - CATEGORY="hugevm" run_test ./va_128TBswitch.sh -fi # VADDR64 - -# vmalloc stability smoke test -CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke - -CATEGORY="mremap" run_test ./mremap_dontunmap - -CATEGORY="hmm" run_test ./test_hmm.sh smoke - -# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests -CATEGORY="madv_populate" run_test ./madv_populate - -CATEGORY="memfd_secret" run_test ./memfd_secret - -# KSM MADV_MERGEABLE test with 10 identical pages -CATEGORY="ksm" run_test ./ksm_tests -M -p 10 -# KSM unmerge test -CATEGORY="ksm" run_test ./ksm_tests -U -# KSM test with 10 zero pages and use_zero_pages = 0 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 -# KSM test with 10 zero pages and use_zero_pages = 1 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 1 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 0 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 - -CATEGORY="ksm" run_test ./ksm_functional_tests - -run_test ./ksm_functional_tests - -# protection_keys tests -if [ -x ./protection_keys_32 ] -then - CATEGORY="pkey" run_test ./protection_keys_32 -fi - -if [ -x ./protection_keys_64 ] -then - CATEGORY="pkey" run_test ./protection_keys_64 -fi - -CATEGORY="soft_dirty" run_test ./soft-dirty - -# COW tests -CATEGORY="cow" run_test ./cow - -exit $exitcode diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/vm/settings deleted file mode 100644 index 9abfc60e9e6f..000000000000 --- a/tools/testing/selftests/vm/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=45 diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c deleted file mode 100644 index 21d8830c5f24..000000000000 --- a/tools/testing/selftests/vm/soft-dirty.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" -#include "vm_util.h" - -#define PAGEMAP_FILE_PATH "/proc/self/pagemap" -#define TEST_ITERATIONS 10000 - -static void test_simple(int pagemap_fd, int pagesize) -{ - int i; - char *map; - - map = aligned_alloc(pagesize, pagesize); - if (!map) - ksft_exit_fail_msg("mmap failed\n"); - - clear_softdirty(); - - for (i = 0 ; i < TEST_ITERATIONS; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - - clear_softdirty(); - } - free(map); - - ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); -} - -static void test_vma_reuse(int pagemap_fd, int pagesize) -{ - char *map, *map2; - - map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // The kernel always marks new regions as soft dirty - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s dirty bit of allocated page\n", __func__); - - clear_softdirty(); - munmap(map, pagesize); - - map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map2 == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // Dirty bit is set for new regions even if they are reused - if (map == map2) - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, - "Test %s dirty bit of reused address page\n", __func__); - else - ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); - - munmap(map2, pagesize); -} - -static void test_hugepage(int pagemap_fd, int pagesize) -{ - char *map; - int i, ret; - size_t hpage_len = read_pmd_pagesize(); - - map = memalign(hpage_len, hpage_len); - if (!map) - ksft_exit_fail_msg("memalign failed\n"); - - ret = madvise(map, hpage_len, MADV_HUGEPAGE); - if (ret) - ksft_exit_fail_msg("madvise failed %d\n", ret); - - for (i = 0; i < hpage_len; i++) - map[i] = (char)i; - - if (check_huge_anon(map, 1, hpage_len)) { - ksft_test_result_pass("Test %s huge page allocation\n", __func__); - - clear_softdirty(); - for (i = 0 ; i < TEST_ITERATIONS ; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - clear_softdirty(); - } - - ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); - } else { - // hugepage allocation failed. skip these tests - ksft_test_result_skip("Test %s huge page allocation\n", __func__); - ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); - } - free(map); -} - -static void test_mprotect(int pagemap_fd, int pagesize, bool anon) -{ - const char *type[] = {"file", "anon"}; - const char *fname = "./soft-dirty-test-file"; - int test_fd; - char *map; - - if (anon) { - map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (!map) - ksft_exit_fail_msg("anon mmap failed\n"); - } else { - test_fd = open(fname, O_RDWR | O_CREAT); - if (test_fd < 0) { - ksft_test_result_skip("Test %s open() file failed\n", __func__); - return; - } - unlink(fname); - ftruncate(test_fd, pagesize); - map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, - MAP_SHARED, test_fd, 0); - if (!map) - ksft_exit_fail_msg("file mmap failed\n"); - } - - *map = 1; - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s-%s dirty bit of new written page\n", - __func__, type[anon]); - clear_softdirty(); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after clear_refs\n", - __func__, type[anon]); - mprotect(map, pagesize, PROT_READ); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after marking RO\n", - __func__, type[anon]); - mprotect(map, pagesize, PROT_READ|PROT_WRITE); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after marking RW\n", - __func__, type[anon]); - *map = 2; - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s-%s soft-dirty after rewritten\n", - __func__, type[anon]); - - munmap(map, pagesize); - - if (!anon) - close(test_fd); -} - -static void test_mprotect_anon(int pagemap_fd, int pagesize) -{ - test_mprotect(pagemap_fd, pagesize, true); -} - -static void test_mprotect_file(int pagemap_fd, int pagesize) -{ - test_mprotect(pagemap_fd, pagesize, false); -} - -int main(int argc, char **argv) -{ - int pagemap_fd; - int pagesize; - - ksft_print_header(); - ksft_set_plan(15); - - pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); - - pagesize = getpagesize(); - - test_simple(pagemap_fd, pagesize); - test_vma_reuse(pagemap_fd, pagesize); - test_hugepage(pagemap_fd, pagesize); - test_mprotect_anon(pagemap_fd, pagesize); - test_mprotect_file(pagemap_fd, pagesize); - - close(pagemap_fd); - - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c deleted file mode 100644 index 76e1c36dd9e5..000000000000 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual - * address range in a process via /split_huge_pages interface. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "vm_util.h" - -uint64_t pagesize; -unsigned int pageshift; -uint64_t pmd_pagesize; - -#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" -#define INPUT_MAX 80 - -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" - -#define PFN_MASK ((1UL<<55)-1) -#define KPF_THP (1UL<<22) - -int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) -{ - uint64_t paddr; - uint64_t page_flags; - - if (pagemap_file) { - pread(pagemap_file, &paddr, sizeof(paddr), - ((long)vaddr >> pageshift) * sizeof(paddr)); - - if (kpageflags_file) { - pread(kpageflags_file, &page_flags, sizeof(page_flags), - (paddr & PFN_MASK) * sizeof(page_flags)); - - return !!(page_flags & KPF_THP); - } - } - return 0; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) - return 0; - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; -} - -static void write_debugfs(const char *fmt, ...) -{ - char input[INPUT_MAX]; - int ret; - va_list argp; - - va_start(argp, fmt); - ret = vsnprintf(input, INPUT_MAX, fmt, argp); - va_end(argp); - - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } -} - -void split_pmd_thp(void) -{ - char *one_page; - size_t len = 4 * pmd_pagesize; - size_t i; - - one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* split all THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); - - for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - - - if (check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } - - printf("Split huge pages successful\n"); - free(one_page); -} - -void split_pte_mapped_thp(void) -{ - char *one_page, *pte_mapped, *pte_mapped2; - size_t len = 4 * pmd_pagesize; - uint64_t thp_size; - size_t i; - const char *pagemap_template = "/proc/%d/pagemap"; - const char *kpageflags_proc = "/proc/kpageflags"; - char pagemap_proc[255]; - int pagemap_fd; - int kpageflags_fd; - - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); - - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } - - kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } - - one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* remap the first pagesize of first THP */ - pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); - - /* remap the Nth pagesize of Nth THP */ - for (i = 1; i < 4; i++) { - pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, - pagesize, pagesize, - MREMAP_MAYMOVE|MREMAP_FIXED, - pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } - } - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } - - /* split all remapped THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - } - - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } - - printf("Split PTE-mapped huge pages successful\n"); - munmap(one_page, len); - close(pagemap_fd); - close(kpageflags_fd); -} - -void split_file_backed_thp(void) -{ - int status; - int fd; - ssize_t num_written; - char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; - const char *tmpfs_loc = mkdtemp(tmpfs_template); - char testfile[INPUT_MAX]; - uint64_t pgoff_start = 0, pgoff_end = 1024; - - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); - - status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } - - status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); - if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; - } - - fd = open(testfile, O_CREAT|O_WRONLY); - if (fd == -1) { - perror("Cannot open testing file\n"); - goto cleanup; - } - - /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); - close(fd); - - if (num_written < 1) { - printf("Fail to write data to testing file\n"); - goto cleanup; - } - - /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); - - status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); - -cleanup: - status = umount(tmpfs_loc); - if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); - } - status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); - } - - printf("file-backed THP split test done, please check dmesg for more information\n"); -} - -int main(int argc, char **argv) -{ - if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); - } - - pagesize = getpagesize(); - pageshift = ffs(pagesize) - 1; - pmd_pagesize = read_pmd_pagesize(); - - split_pmd_thp(); - split_pte_mapped_thp(); - split_file_backed_thp(); - - return 0; -} diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh deleted file mode 100755 index 46e19b5d648d..000000000000 --- a/tools/testing/selftests/vm/test_hmm.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2018 Uladzislau Rezki (Sony) -# -# This is a test script for the kernel test driver to analyse vmalloc -# allocator. Therefore it is just a kernel module loader. You can specify -# and pass different parameters in order to: -# a) analyse performance of vmalloc allocations; -# b) stressing and stability check of vmalloc subsystem. - -TEST_NAME="test_hmm" -DRIVER="test_hmm" - -# 1 if fails -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -check_test_requirements() -{ - uid=$(id -u) - if [ $uid -ne 0 ]; then - echo "$0: Must be run as root" - exit $ksft_skip - fi - - if ! which modprobe > /dev/null 2>&1; then - echo "$0: You need modprobe installed" - exit $ksft_skip - fi - - if ! modinfo $DRIVER > /dev/null 2>&1; then - echo "$0: You must have the following enabled in your kernel:" - echo "CONFIG_TEST_HMM=m" - exit $ksft_skip - fi -} - -load_driver() -{ - if [ $# -eq 0 ]; then - modprobe $DRIVER > /dev/null 2>&1 - else - if [ $# -eq 2 ]; then - modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 - > /dev/null 2>&1 - else - echo "Missing module parameters. Make sure pass"\ - "spm_addr_dev0 and spm_addr_dev1" - usage - fi - fi -} - -unload_driver() -{ - modprobe -r $DRIVER > /dev/null 2>&1 -} - -run_smoke() -{ - echo "Running smoke test. Note, this test provides basic coverage." - - load_driver $1 $2 - $(dirname "${BASH_SOURCE[0]}")/hmm-tests - unload_driver -} - -usage() -{ - echo -n "Usage: $0" - echo - echo "Example usage:" - echo - echo "# Shows help message" - echo "./${TEST_NAME}.sh" - echo - echo "# Smoke testing" - echo "./${TEST_NAME}.sh smoke" - echo - echo "# Smoke testing with SPM enabled" - echo "./${TEST_NAME}.sh smoke " - echo - exit 0 -} - -function run_test() -{ - if [ $# -eq 0 ]; then - usage - else - if [ "$1" = "smoke" ]; then - run_smoke $2 $3 - else - usage - fi - fi -} - -check_test_requirements -run_test $@ - -exit 0 diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh deleted file mode 100755 index d73b846736f1..000000000000 --- a/tools/testing/selftests/vm/test_vmalloc.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2018 Uladzislau Rezki (Sony) -# -# This is a test script for the kernel test driver to analyse vmalloc -# allocator. Therefore it is just a kernel module loader. You can specify -# and pass different parameters in order to: -# a) analyse performance of vmalloc allocations; -# b) stressing and stability check of vmalloc subsystem. - -TEST_NAME="vmalloc" -DRIVER="test_${TEST_NAME}" -NUM_CPUS=`grep -c ^processor /proc/cpuinfo` - -# 1 if fails -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# -# Static templates for performance, stressing and smoke tests. -# Also it is possible to pass any supported parameters manualy. -# -PERF_PARAM="sequential_test_order=1 test_repeat_count=3" -SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" -STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" - -check_test_requirements() -{ - uid=$(id -u) - if [ $uid -ne 0 ]; then - echo "$0: Must be run as root" - exit $ksft_skip - fi - - if ! which modprobe > /dev/null 2>&1; then - echo "$0: You need modprobe installed" - exit $ksft_skip - fi - - if ! modinfo $DRIVER > /dev/null 2>&1; then - echo "$0: You must have the following enabled in your kernel:" - echo "CONFIG_TEST_VMALLOC=m" - exit $ksft_skip - fi -} - -run_perfformance_check() -{ - echo "Run performance tests to evaluate how fast vmalloc allocation is." - echo "It runs all test cases on one single CPU with sequential order." - - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 - echo "Done." - echo "Ccheck the kernel message buffer to see the summary." -} - -run_stability_check() -{ - echo "Run stability tests. In order to stress vmalloc subsystem all" - echo "available test cases are run by NUM_CPUS workers simultaneously." - echo "It will take time, so be patient." - - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -run_smoke_check() -{ - echo "Run smoke test. Note, this test provides basic coverage." - echo "Please check $0 output how it can be used" - echo "for deep performance analysis as well as stress testing." - - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -usage() -{ - echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " - echo "manual parameters" - echo - echo "Valid tests and parameters:" - echo - modinfo $DRIVER - echo - echo "Example usage:" - echo - echo "# Shows help message" - echo "./${DRIVER}.sh" - echo - echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" - echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" - echo - echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " - echo "sequential order" - echo -n "./${DRIVER}.sh sequential_test_order=1 " - echo "run_test_mask=23" - echo - echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " - echo "20 times" - echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" - echo - echo "# Performance analysis" - echo "./${DRIVER}.sh performance" - echo - echo "# Stress testing" - echo "./${DRIVER}.sh stress" - echo - exit 0 -} - -function validate_passed_args() -{ - VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` - - # - # Something has been passed, check it. - # - for passed_arg in $@; do - key=${passed_arg//=*/} - val="${passed_arg:$((${#key}+1))}" - valid=0 - - for valid_arg in $VALID_ARGS; do - if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then - valid=1 - break - fi - done - - if [[ $valid -ne 1 ]]; then - echo "Error: key or value is not correct: ${key} $val" - exit $exitcode - fi - done -} - -function run_manual_check() -{ - # - # Validate passed parameters. If there is wrong one, - # the script exists and does not execute further. - # - validate_passed_args $@ - - echo "Run the test with following parameters: $@" - modprobe $DRIVER $@ > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -function run_test() -{ - if [ $# -eq 0 ]; then - usage - else - if [[ "$1" = "performance" ]]; then - run_perfformance_check - elif [[ "$1" = "stress" ]]; then - run_stability_check - elif [[ "$1" = "smoke" ]]; then - run_smoke_check - else - run_manual_check $@ - fi - fi -} - -check_test_requirements -run_test $@ - -exit 0 diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c deleted file mode 100644 index 361ef7192cc6..000000000000 --- a/tools/testing/selftests/vm/thuge-gen.c +++ /dev/null @@ -1,257 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Test selecting other page sizes for mmap/shmget. - - Before running this huge pages for each huge page size must have been - reserved. - For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. - Also shmmax must be increased. - And you need to run as root to work around some weird permissions in shm. - And nothing using huge pages should run in parallel. - When the program aborts you may need to clean up the shm segments with - ipcrm -m by hand, like this - sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m - (warning this will remove all if someone else uses them) */ - -#define _GNU_SOURCE 1 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define err(x) perror(x), exit(1) - -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f -#if !defined(MAP_HUGETLB) -#define MAP_HUGETLB 0x40000 -#endif - -#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ -#define SHM_HUGE_SHIFT 26 -#define SHM_HUGE_MASK 0x3f -#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) -#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) - -#define NUM_PAGESIZES 5 - -#define NUM_PAGES 4 - -#define Dprintf(fmt...) // printf(fmt) - -unsigned long page_sizes[NUM_PAGESIZES]; -int num_page_sizes; - -int ilog2(unsigned long v) -{ - int l = 0; - while ((1UL << l) < v) - l++; - return l; -} - -void find_pagesizes(void) -{ - glob_t g; - int i; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; i < g.gl_pathc; i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[i]); - page_sizes[i] <<= 10; - printf("Found %luMB\n", page_sizes[i] >> 20); - } - num_page_sizes = g.gl_pathc; - globfree(&g); -} - -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - free(line); - return hps; -} - -void show(unsigned long ps) -{ - char buf[100]; - if (ps == getpagesize()) - return; - printf("%luMB: ", ps >> 20); - fflush(stdout); - snprintf(buf, sizeof buf, - "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); - system(buf); -} - -unsigned long read_sysfs(int warn, char *fmt, ...) -{ - char *line = NULL; - size_t linelen = 0; - char buf[100]; - FILE *f; - va_list ap; - unsigned long val = 0; - - va_start(ap, fmt); - vsnprintf(buf, sizeof buf, fmt, ap); - va_end(ap); - - f = fopen(buf, "r"); - if (!f) { - if (warn) - printf("missing %s\n", buf); - return 0; - } - if (getline(&line, &linelen, f) > 0) { - sscanf(line, "%lu", &val); - } - fclose(f); - free(line); - return val; -} - -unsigned long read_free(unsigned long ps) -{ - return read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); -} - -void test_mmap(unsigned long size, unsigned flags) -{ - char *map; - unsigned long before, after; - int err; - - before = read_free(size); - map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); - - if (map == (char *)-1) err("mmap"); - memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); - show(size); - err = munmap(map, size); - assert(!err); -} - -void test_shmget(unsigned long size, unsigned flags) -{ - int id; - unsigned long before, after; - int err; - - before = read_free(size); - id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); - if (id < 0) err("shmget"); - - struct shm_info i; - if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); - Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); - - - Dprintf("id %d\n", id); - char *map = shmat(id, NULL, 0600); - if (map == (char*)-1) err("shmat"); - - shmctl(id, IPC_RMID, NULL); - - memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); - - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); - show(size); - err = shmdt(map); - assert(!err); -} - -void sanity_checks(void) -{ - int i; - unsigned long largest = getpagesize(); - - for (i = 0; i < num_page_sizes; i++) { - if (page_sizes[i] > largest) - largest = page_sizes[i]; - - if (read_free(page_sizes[i]) < NUM_PAGES) { - printf("Not enough huge pages for page size %lu MB, need %u\n", - page_sizes[i] >> 20, - NUM_PAGES); - exit(0); - } - } - - if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { - printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); - exit(0); - } - -#if defined(__x86_64__) - if (largest != 1U<<30) { - printf("No GB pages available on x86-64\n" - "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); - exit(0); - } -#endif -} - -int main(void) -{ - int i; - unsigned default_hps = default_huge_page_size(); - - find_pagesizes(); - - sanity_checks(); - - for (i = 0; i < num_page_sizes; i++) { - unsigned long ps = page_sizes[i]; - int arg = ilog2(ps) << MAP_HUGE_SHIFT; - printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); - test_mmap(ps, MAP_HUGETLB | arg); - } - printf("Testing default huge mmap\n"); - test_mmap(default_hps, SHM_HUGETLB); - - puts("Testing non-huge shmget"); - test_shmget(getpagesize(), 0); - - for (i = 0; i < num_page_sizes; i++) { - unsigned long ps = page_sizes[i]; - int arg = ilog2(ps) << SHM_HUGE_SHIFT; - printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); - test_shmget(ps, SHM_HUGETLB | arg); - } - puts("default huge shmget"); - test_shmget(default_hps, SHM_HUGETLB); - - return 0; -} diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c deleted file mode 100644 index e3f00adb1b82..000000000000 --- a/tools/testing/selftests/vm/transhuge-stress.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Stress test for transparent huge pages, memory compaction and migration. - * - * Authors: Konstantin Khlebnikov - * - * This is free and unencumbered software released into the public domain. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "util.h" - -int backing_fd = -1; -int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; -#define PROT_RW (PROT_READ | PROT_WRITE) - -int main(int argc, char **argv) -{ - size_t ram, len; - void *ptr, *p; - struct timespec a, b; - int i = 0; - char *name = NULL; - double s; - uint8_t *map; - size_t map_len; - int pagemap_fd; - - ram = sysconf(_SC_PHYS_PAGES); - if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) - ram = SIZE_MAX / 4; - else - ram *= sysconf(_SC_PAGESIZE); - len = ram; - - while (++i < argc) { - if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [size in MiB]", argv[0]); - else if (!strcmp(argv[i], "-f")) - name = argv[++i]; - else - len = atoll(argv[i]) << 20; - } - - if (name) { - backing_fd = open(name, O_RDWR); - if (backing_fd == -1) - errx(2, "open %s", name); - mmap_flags = MAP_SHARED; - } - - warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" - " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - err(2, "open pagemap"); - - len -= len % HPAGE_SIZE; - ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); - if (ptr == MAP_FAILED) - err(2, "initial mmap"); - ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; - - if (madvise(ptr, len, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - map_len = ram >> (HPAGE_SHIFT - 1); - map = malloc(map_len); - if (!map) - errx(2, "map malloc"); - - while (1) { - int nr_succeed = 0, nr_failed = 0, nr_pages = 0; - - memset(map, 0, map_len); - - clock_gettime(CLOCK_MONOTONIC, &a); - for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { - int64_t pfn; - - pfn = allocate_transhuge(p, pagemap_fd); - - if (pfn < 0) { - nr_failed++; - } else { - size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); - - nr_succeed++; - if (idx >= map_len) { - map = realloc(map, idx + 1); - if (!map) - errx(2, "map realloc"); - memset(map + map_len, 0, idx + 1 - map_len); - map_len = idx + 1; - } - if (!map[idx]) - nr_pages++; - map[idx] = 1; - } - - /* split transhuge page, keep last page */ - if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) - err(2, "MADV_DONTNEED"); - } - clock_gettime(CLOCK_MONOTONIC, &b); - s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; - - warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" - "%4d succeed, %4d failed, %4d different pages", - s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), - nr_succeed, nr_failed, nr_pages); - } -} diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c deleted file mode 100644 index 7f22844ed704..000000000000 --- a/tools/testing/selftests/vm/userfaultfd.c +++ /dev/null @@ -1,1858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stress userfaultfd syscall. - * - * Copyright (C) 2015 Red Hat, Inc. - * - * This test allocates two virtual areas and bounces the physical - * memory across the two virtual areas (from area_src to area_dst) - * using userfaultfd. - * - * There are three threads running per CPU: - * - * 1) one per-CPU thread takes a per-page pthread_mutex in a random - * page of the area_dst (while the physical page may still be in - * area_src), and increments a per-page counter in the same page, - * and checks its value against a verification region. - * - * 2) another per-CPU thread handles the userfaults generated by - * thread 1 above. userfaultfd blocking reads or poll() modes are - * exercised interleaved. - * - * 3) one last per-CPU thread transfers the memory in the background - * at maximum bandwidth (if not already transferred by thread - * 2). Each cpu thread takes cares of transferring a portion of the - * area. - * - * When all threads of type 3 completed the transfer, one bounce is - * complete. area_src and area_dst are then swapped. All threads are - * respawned and so the bounce is immediately restarted in the - * opposite direction. - * - * per-CPU threads 1 by triggering userfaults inside - * pthread_mutex_lock will also verify the atomicity of the memory - * transfer (UFFDIO_COPY). - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include "vm_util.h" - -#ifdef __NR_userfaultfd - -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - -#define BOUNCE_RANDOM (1<<0) -#define BOUNCE_RACINGFAULTS (1<<1) -#define BOUNCE_VERIFY (1<<2) -#define BOUNCE_POLL (1<<3) -static int bounces; - -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - -/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ -#define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -static volatile bool test_uffdio_zeropage_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -static char *zeropage; -pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. - */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) - -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - -const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" - "# Run share memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./userfaultfd " - "[hugetlbfs_file]\n\n"); - fprintf(stderr, "Supported : anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - - fprintf(stderr, "Examples:\n\n"); - fprintf(stderr, "%s", examples); - exit(1); -} - -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) - -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) -{ - int i; - - for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; - } -} - -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - -static inline uint64_t uffd_minor_feature(void) -{ - if (test_type == TEST_HUGETLB && map_shared) - return UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - return UFFD_FEATURE_MINOR_SHMEM; - else - return 0; -} - -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - -static void userfaultfd_open(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); - if (uffdio_api.api != UFFD_API) - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); - - *features = uffdio_api.features; -} - -static inline void munmap_area(void **area) -{ - if (*area) - if (munmap(*area, nr_pages * page_size)) - err("munmap"); - - *area = NULL; -} - -static void uffd_test_ctx_clear(void) -{ - size_t i; - - if (pipefd) { - for (i = 0; i < nr_cpus * 2; ++i) { - if (close(pipefd[i])) - err("close pipefd"); - } - free(pipefd); - pipefd = NULL; - } - - if (count_verify) { - free(count_verify); - count_verify = NULL; - } - - if (uffd != -1) { - if (close(uffd)) - err("close uffd"); - uffd = -1; - } - - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); -} - -static void uffd_test_ctx_init(uint64_t features) -{ - unsigned long nr, cpu; - - uffd_test_ctx_clear(); - - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); - - userfaultfd_open(&features); - - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) - err("count_verify"); - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - /* - * After initialization of area_src, we must explicitly release pages - * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be errornously initialized with zero pages, - * hence we could hit memory corruption later in the test. - * - * One example is when THP is globally enabled, above allocate_area() - * calls could have the two areas merged into a single VMA (as they - * will have the same VMA flags so they're mergeable). When we - * initialize the area_src above, it's possible that some part of - * area_dst could have been faulted in via one huge THP that will be - * shared between area_src and area_dst. It could cause some of the - * area_dst won't be trapped by missing userfaults. - * - * This release_pages() will guarantee even if that happened, we'll - * proactively split the thp and drop any accidentally initialized - * pages within area_dst. - */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - -static void *locking_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; - unsigned long long count; - - if (!(bounces & BOUNCE_RANDOM)) { - page_nr = -bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; - } - - while (!finished) { - if (bounces & BOUNCE_RANDOM) { - if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) - err("getrandom failed"); - } else - page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) - err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); - count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - } - - return NULL; -} - -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - -static int copy_page_retry(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, true); -} - -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. - */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - -pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void *uffd_read_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - struct uffd_msg msg; - - pthread_mutex_unlock(&uffd_read_mutex); - /* from here cancellation is ok */ - - for (;;) { - if (uffd_read_msg(uffd, &msg)) - continue; - uffd_handle_page_fault(&msg, stats); - } - - return NULL; -} - -static void *background_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr, start_nr, mid_nr, end_nr; - - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; - mid_nr = (start_nr + end_nr) / 2; - - /* Copy the first half of the pages */ - for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - /* - * If we need to test uffd-wp, set it up now. Then we'll have - * at least the first half of the pages mapped already which - * can be write-protected for testing - */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, - nr_pages_per_cpu * page_size, true); - - /* - * Continue the 2nd half of the page copying, handling write - * protection faults if any - */ - for (page_nr = mid_nr; page_nr < end_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - return NULL; -} - -static int stress(struct uffd_stats *uffd_stats) -{ - unsigned long cpu; - pthread_t locking_threads[nr_cpus]; - pthread_t uffd_threads[nr_cpus]; - pthread_t background_threads[nr_cpus]; - - finished = 0; - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pthread_create(&locking_threads[cpu], &attr, - locking_thread, (void *)cpu)) - return 1; - if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_read_thread, - (void *)&uffd_stats[cpu])) - return 1; - pthread_mutex_lock(&uffd_read_mutex); - } - if (pthread_create(&background_threads[cpu], &attr, - background_thread, (void *)cpu)) - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(background_threads[cpu], NULL)) - return 1; - - /* - * Be strict and immediately zap area_src, the whole area has - * been transferred already by the background treads. The - * area_src could then be faulted in a racy way by still - * running uffdio_threads reading zeropages after we zapped - * area_src (but they're guaranteed to get -EEXIST from - * UFFDIO_COPY without writing zero pages into area_dst - * because the background threads already completed). - */ - uffd_test_ops->release_pages(area_src); - - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - char c; - if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) - err("pipefd write error"); - if (pthread_join(uffd_threads[cpu], - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_cancel(uffd_threads[cpu])) - return 1; - if (pthread_join(uffd_threads[cpu], NULL)) - return 1; - } - } - - return 0; -} - -sigjmp_buf jbuf, *sigbuf; - -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) -{ - if (sig == SIGBUS) { - if (sigbuf) - siglongjmp(*sigbuf, 1); - abort(); - } -} - -/* - * For non-cooperative userfaultfd test we fork() a process that will - * generate pagefaults, will mremap the area monitored by the - * userfaultfd and at last this process will release the monitored - * area. - * For the anonymous and shared memory the area is divided into two - * parts, the first part is accessed before mremap, and the second - * part is accessed after mremap. Since hugetlbfs does not support - * mremap, the entire monitored area is accessed in a single pass for - * HUGETLB_TEST. - * The release of the pages currently generates event for shmem and - * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked - * for hugetlb. - * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register - * monitored area, generate pagefaults and test that signal is delivered. - * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 - * test robustness use case - we release monitored area, fork a process - * that will generate pagefaults and verify signal is generated. - * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - err("UFFDIO_ZEROPAGE unexpected size"); - } else { - if (test_uffdio_zeropage_eexist && retry) { - test_uffdio_zeropage_eexist = false; - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - } - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset, false); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - struct uffdio_register uffdio_register; - - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - -static int userfaultfd_events_test(void) -{ - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - return stats.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - struct uffdio_register uffdio_register; - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (faulting_process(1)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - char c; - struct uffd_stats stats = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - hpage_size); - /* - * This won't cause uffd-fault - it purely just makes sure there - * was no corruption. - */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; -} - -#define BIT_ULL(nr) (1ULL << (nr)) -#define PM_SOFT_DIRTY BIT_ULL(55) -#define PM_MMAP_EXCLUSIVE BIT_ULL(56) -#define PM_UFFD_WP BIT_ULL(57) -#define PM_FILE BIT_ULL(61) -#define PM_SWAP BIT_ULL(62) -#define PM_PRESENT BIT_ULL(63) - -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -static uint64_t pagemap_read_vaddr(int fd, void *vaddr) -{ - uint64_t value; - int ret; - - ret = pread(fd, &value, sizeof(uint64_t), - ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); - if (ret != sizeof(uint64_t)) - err("pread() on pagemap failed"); - - return value; -} - -/* This macro let __LINE__ works in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_read_vaddr(fd, area_dst); - /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -} - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - struct uffdio_register uffdio_register; - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(0); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(true)) - err("Detected stall uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* Uffd-wp should persist even swapped out */ - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(false)) - err("Detected stall uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - -static int userfaultfd_stress(void) -{ - void *area; - unsigned long nr; - struct uffdio_register uffdio_register; - struct uffd_stats uffd_stats[nr_cpus]; - - uffd_test_ctx_init(0); - - if (posix_memalign(&area, page_size, page_size)) - err("out of memory"); - zeropage = area; - bzero(zeropage, page_size); - - pthread_mutex_lock(&uffd_read_mutex); - - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, 16*1024*1024); - - while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); - fflush(stdout); - - if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - else - fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); - - /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure alias"); - } - - /* - * The madvise done previously isn't enough: some - * uffd_thread could have read userfaults (one of - * those already resolved by the background thread) - * and it may be in the process of calling - * UFFDIO_COPY. UFFDIO_COPY will read the zapped - * area_src and it would map a zero page in it (of - * course such a UFFDIO_COPY is perfectly safe as it'd - * return -EEXIST). The problem comes at the next - * bounce though: that racing UFFDIO_COPY would - * generate zeropages in the area_src, so invalidating - * the previous MADV_DONTNEED. Without this additional - * MADV_DONTNEED those zeropages leftovers in the - * area_src would lead to -EEXIST failure during the - * next bounce, effectively leaving a zeropage in the - * area_dst. - * - * Try to comment this out madvise to see the memory - * corruption being caught pretty quick. - * - * khugepaged is also inhibited to collapse THP after - * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's - * required to MADV_DONTNEED here. - */ - uffd_test_ops->release_pages(area_dst); - - uffd_stats_reset(uffd_stats, nr_cpus); - - /* bounce pass */ - if (stress(uffd_stats)) - return 1; - - /* Clear all the write protections if there is any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); - - /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) - err("unregister failure"); - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) - err("unregister failure alias"); - } - - /* verification */ - if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) - err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); - - /* prepare next bounce */ - swap(area_src, area_dst); - - swap(area_src_alias, area_dst_alias); - - uffd_stats_report(uffd_stats, nr_cpus); - } - - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly when fork), meanwhile the pagemap test will verify - * pgtable entry of fork()ed child. - */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code for x86_64 for now for 2M THP, as x86_64 is - * currently the only one that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); -} - -/* - * Copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static void set_test_type(const char *type) -{ - if (!strcmp(type, "anon")) { - test_type = TEST_ANON; - uffd_test_ops = &anon_uffd_test_ops; - } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { - map_shared = true; - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here. */ - test_uffdio_minor = true; - } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; - uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; - } -} - -static void parse_test_type_arg(const char *raw_type) -{ - char *buf = strdup(raw_type); - uint64_t features = UFFD_API_FEATURES; - - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; - else - err("unrecognized test mod '%s'", token); - } - - if (!test_type) - err("failed to parse test type argument: '%s'", raw_type); - - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - - if (test_type == TEST_HUGETLB) - page_size = hpage_size; - else - page_size = sysconf(_SC_PAGE_SIZE); - - if (!page_size) - err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - err("Impossible to run this test"); - - /* - * Whether we can test certain features depends not just on test type, - * but also on whether or not this particular kernel supports the - * feature. - */ - - userfaultfd_open(&features); - - test_uffdio_wp = test_uffdio_wp && - (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); - - close(uffd); - uffd = -1; -} - -static void sigalrm(int sig) -{ - if (sig != SIGALRM) - abort(); - test_uffdio_copy_eexist = true; - test_uffdio_zeropage_eexist = true; - alarm(ALARM_INTERVAL_SECS); -} - -int main(int argc, char **argv) -{ - size_t bytes; - - if (argc < 4) - usage(); - - if (signal(SIGALRM, sigalrm) == SIG_ERR) - err("failed to arm SIGALRM"); - alarm(ALARM_INTERVAL_SECS); - - hpage_size = default_huge_page_size(); - parse_test_type_arg(argv[1]); - bytes = atol(argv[2]) * 1024 * 1024; - - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * paze_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (2 << (H-P)) - * where: - * bytes = hpage_size * N - * hpage_size = 2 << H - * page_size = 2 << P - * - * And we want to chose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, prime factorization of - * N and nr_cpus may be arbitrary, so have to search for it. - * Instead, just use the highest power of 2 dividing both - * nr_cpus and (bytes / page_size). - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ? x : y; - } - nr_pages_per_cpu = bytes / page_size / nr_cpus; - if (!nr_pages_per_cpu) { - _err("invalid MiB"); - usage(); - } - - bounces = atoi(argv[3]); - if (bounces <= 0) { - _err("invalid bounces"); - usage(); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h deleted file mode 100644 index b27d26199334..000000000000 --- a/tools/testing/selftests/vm/util.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __KSELFTEST_VM_UTIL_H -#define __KSELFTEST_VM_UTIL_H - -#include -#include -#include -#include /* ffsl() */ -#include /* _SC_PAGESIZE */ - -static unsigned int __page_size; -static unsigned int __page_shift; - -static inline unsigned int page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned int page_shift(void) -{ - if (!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -#define PAGE_SHIFT (page_shift()) -#define PAGE_SIZE (page_size()) -/* - * On ppc64 this will only work with radix 2M hugepage size - */ -#define HPAGE_SHIFT 21 -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - - -static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - -#endif diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c deleted file mode 100644 index 1d2068989883..000000000000 --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Authors: Kirill A. Shutemov - * Authors: Aneesh Kumar K.V - */ - -#include -#include -#include - -#include "../kselftest.h" - -#ifdef __powerpc64__ -#define PAGE_SIZE (64 << 10) -/* - * This will work with 16M and 2M hugepage size - */ -#define HUGETLB_SIZE (16 << 20) -#else -#define PAGE_SIZE (4 << 10) -#define HUGETLB_SIZE (2 << 20) -#endif - -/* - * >= 128TB is the hint addr value we used to select - * large address space. - */ -#define ADDR_SWITCH_HINT (1UL << 47) -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 48)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - /* - * If stack is moved, we could possibly allocate - * this at the requested address. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - /* - * We should never allocate at the requested address or above it - * The len cross the 128TB boundary. Without MAP_FIXED - * we will always search in the lower address space. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", - .low_addr_required = 1, - }, - { - /* - * Exact mapping at 128TB, the area is free we should get that - * even without MAP_FIXED. - */ - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, -}; - -static struct testcase hugetlb_testcases[] = { - { - .addr = NULL, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -static int run_test(struct testcase *test, int count) -{ - void *p; - int i, ret = KSFT_PASS; - - for (i = 0; i < count; i++) { - struct testcase *t = test + i; - - p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - ret = KSFT_FAIL; - continue; - } - - if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { - printf("FAILED\n"); - ret = KSFT_FAIL; - } else { - /* - * Do a dereference of the address returned so that we catch - * bugs in page fault handling - */ - memset(p, 0, t->size); - printf("OK\n"); - } - if (!t->keep_mapped) - munmap(p, t->size); - } - - return ret; -} - -static int supported_arch(void) -{ -#if defined(__powerpc64__) - return 1; -#elif defined(__x86_64__) - return 1; -#else - return 0; -#endif -} - -int main(int argc, char **argv) -{ - int ret; - - if (!supported_arch()) - return KSFT_SKIP; - - ret = run_test(testcases, ARRAY_SIZE(testcases)); - if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); - return ret; -} diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh deleted file mode 100755 index 41580751dc51..000000000000 --- a/tools/testing/selftests/vm/va_128TBswitch.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2022 Adam Sindelar (Meta) -# -# This is a test for mmap behavior with 5-level paging. This script wraps the -# real test to check that the kernel is configured to support at least 5 -# pagetable levels. - -# 1 means the test failed -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -fail() -{ - echo "$1" - exit $exitcode -} - -check_supported_x86_64() -{ - local config="/proc/config.gz" - [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" - [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" - - # gzip -dcfq automatically handles both compressed and plaintext input. - # See man 1 gzip under '-f'. - local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) - - if [[ "${pg_table_levels}" -lt 5 ]]; then - echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" - exit $ksft_skip - fi -} - -check_test_requirements() -{ - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. - case `uname -m` in - "x86_64") - check_supported_x86_64 - ;; - *) - return 0 - ;; - esac -} - -check_test_requirements -./va_128TBswitch diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c deleted file mode 100644 index c0592646ed93..000000000000 --- a/tools/testing/selftests/vm/virtual_address_range.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include -#include -#include -#include -#include -#include -#include - -/* - * Maximum address range mapped with a single mmap() - * call is little bit more than 16GB. Hence 16GB is - * chosen as the single chunk size for address space - * mapping. - */ -#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas one below 128TB and one above 128TB - * till it reaches 512TB. One with size 128TB and the - * other being 384TB. - * - * On Arm64 the address space is 256TB and no high mappings - * are supported so far. - */ - -#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH 0 -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hind_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static int validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; - } - - if (addr > HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char *hptr[NR_CHUNKS_HIGH]; - char *hint; - unsigned long i, lchunks, hchunks; - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - return 1; - break; - } - - if (validate_addr(ptr[i], 0)) - return 1; - } - lchunks = i; - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hind_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - if (validate_addr(hptr[i], 1)) - return 1; - } - hchunks = i; - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - return 0; -} diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c deleted file mode 100644 index 40e795624ff3..000000000000 --- a/tools/testing/selftests/vm/vm_util.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "../kselftest.h" -#include "vm_util.h" - -#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" -#define SMAP_FILE_PATH "/proc/self/smaps" -#define MAX_LINE_LENGTH 500 - -uint64_t pagemap_get_entry(int fd, char *start) -{ - const unsigned long pfn = (unsigned long)start / getpagesize(); - uint64_t entry; - int ret; - - ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); - if (ret != sizeof(entry)) - ksft_exit_fail_msg("reading pagemap failed\n"); - return entry; -} - -bool pagemap_is_softdirty(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - // Check if dirty bit (55th bit) is set - return entry & 0x0080000000000000ull; -} - -bool pagemap_is_swapped(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - return entry & 0x4000000000000000ull; -} - -bool pagemap_is_populated(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* Present or swapped. */ - return entry & 0xc000000000000000ull; -} - -unsigned long pagemap_get_pfn(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* If present (63th bit), PFN is at bit 0 -- 54. */ - if (entry & 0x8000000000000000ull) - return entry & 0x007fffffffffffffull; - return -1ul; -} - -void clear_softdirty(void) -{ - int ret; - const char *ctrl = "4"; - int fd = open("/proc/self/clear_refs", O_WRONLY); - - if (fd < 0) - ksft_exit_fail_msg("opening clear_refs failed\n"); - ret = write(fd, ctrl, strlen(ctrl)); - close(fd); - if (ret != strlen(ctrl)) - ksft_exit_fail_msg("writing clear_refs failed\n"); -} - -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) -{ - while (fgets(buf, len, fp)) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - -uint64_t read_pmd_pagesize(void) -{ - int fd; - char buf[20]; - ssize_t num_read; - - fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); - if (fd == -1) - ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); - - num_read = read(fd, buf, 19); - if (num_read < 1) { - close(fd); - ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); - } - buf[num_read] = '\0'; - close(fd); - - return strtoul(buf, NULL, 10); -} - -bool __check_huge(void *addr, char *pattern, int nr_hpages, - uint64_t hpage_size) -{ - uint64_t thp = -1; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) - ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); - - fp = fopen(SMAP_FILE_PATH, "r"); - if (!fp) - ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - - if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) - goto err_out; - - /* - * Fetch the pattern in the same block and check the number of - * hugepages. - */ - if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) - goto err_out; - - snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); - - if (sscanf(buffer, addr_pattern, &thp) != 1) - ksft_exit_fail_msg("Reading smap error\n"); - -err_out: - fclose(fp); - return thp == (nr_hpages * (hpage_size >> 10)); -} - -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); -} - -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); -} - -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); -} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h deleted file mode 100644 index 1995ee911ef2..000000000000 --- a/tools/testing/selftests/vm/vm_util.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include - -uint64_t pagemap_get_entry(int fd, char *start); -bool pagemap_is_softdirty(int fd, char *start); -bool pagemap_is_swapped(int fd, char *start); -bool pagemap_is_populated(int fd, char *start); -unsigned long pagemap_get_pfn(int fd, char *start); -void clear_softdirty(void); -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); -uint64_t read_pmd_pagesize(void); -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh deleted file mode 100644 index 70a02301f4c2..000000000000 --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -set -e - -size=$1 -populate=$2 -write=$3 -cgroup=$4 -path=$5 -method=$6 -private=$7 -want_sleep=$8 -reserve=$9 - -echo "Putting task in cgroup '$cgroup'" -echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs - -echo "Method is $method" - -set +e -./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ - "$private" "$want_sleep" "$reserve" diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c deleted file mode 100644 index 6a2caba19ee1..000000000000 --- a/tools/testing/selftests/vm/write_to_hugetlbfs.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This program reserves and uses hugetlb memory, supporting a bunch of - * scenarios needed by the charged_reserved_hugetlb.sh test. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Global definitions. */ -enum method { - HUGETLBFS, - MMAP_MAP_HUGETLB, - SHM, - MAX_METHOD -}; - - -/* Global variables. */ -static const char *self; -static char *shmaddr; -static int shmid; - -/* - * Show usage and exit. - */ -static void exit_usage(void) -{ - printf("Usage: %s -p -s " - "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " - "[-o] [-w] [-n]\n", - self); - exit(EXIT_FAILURE); -} - -void sig_handler(int signo) -{ - printf("Received %d.\n", signo); - if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); - } - exit(2); -} - -int main(int argc, char **argv) -{ - int fd = 0; - int key = 0; - int *ptr = NULL; - int c = 0; - int size = 0; - char path[256] = ""; - enum method method = MAX_METHOD; - int want_sleep = 0, private = 0; - int populate = 0; - int write = 0; - int reserve = 1; - - if (signal(SIGINT, sig_handler) == SIG_ERR) - err(1, "\ncan't catch SIGINT\n"); - - /* Parse command-line arguments. */ - setvbuf(stdout, NULL, _IONBF, 0); - self = argv[0]; - - while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { - switch (c) { - case 's': - size = atoi(optarg); - break; - case 'p': - strncpy(path, optarg, sizeof(path)); - break; - case 'm': - if (atoi(optarg) >= MAX_METHOD) { - errno = EINVAL; - perror("Invalid -m."); - exit_usage(); - } - method = atoi(optarg); - break; - case 'o': - populate = 1; - break; - case 'w': - write = 1; - break; - case 'l': - want_sleep = 1; - break; - case 'r': - private - = 1; - break; - case 'n': - reserve = 0; - break; - default: - errno = EINVAL; - perror("Invalid arg"); - exit_usage(); - } - } - - if (strncmp(path, "", sizeof(path)) != 0) { - printf("Writing to this path: %s\n", path); - } else { - errno = EINVAL; - perror("path not found"); - exit_usage(); - } - - if (size != 0) { - printf("Writing this size: %d\n", size); - } else { - errno = EINVAL; - perror("size not found"); - exit_usage(); - } - - if (!populate) - printf("Not populating.\n"); - else - printf("Populating.\n"); - - if (!write) - printf("Not writing to memory.\n"); - - if (method == MAX_METHOD) { - errno = EINVAL; - perror("-m Invalid"); - exit_usage(); - } else - printf("Using method=%d\n", method); - - if (!private) - printf("Shared mapping.\n"); - else - printf("Private mapping.\n"); - - if (!reserve) - printf("NO_RESERVE mapping.\n"); - else - printf("RESERVE mapping.\n"); - - switch (method) { - case HUGETLBFS: - printf("Allocating using HUGETLBFS.\n"); - fd = open(path, O_CREAT | O_RDWR, 0777); - if (fd == -1) - err(1, "Failed to open file."); - - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? MAP_PRIVATE : MAP_SHARED) | - (populate ? MAP_POPULATE : 0) | - (reserve ? 0 : MAP_NORESERVE), - fd, 0); - - if (ptr == MAP_FAILED) { - close(fd); - err(1, "Error mapping the file"); - } - break; - case MMAP_MAP_HUGETLB: - printf("Allocating using MAP_HUGETLB.\n"); - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : - MAP_SHARED) | - MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | - (reserve ? 0 : MAP_NORESERVE), - -1, 0); - - if (ptr == MAP_FAILED) - err(1, "mmap"); - - printf("Returned address is %p\n", ptr); - break; - case SHM: - printf("Allocating using SHM.\n"); - shmid = shmget(key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - shmid = shmget(++key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) - err(1, "shmget"); - } - printf("shmid: 0x%x, shmget key:%d\n", shmid, key); - - ptr = shmat(shmid, NULL, 0); - if (ptr == (int *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", ptr); - - break; - default: - errno = EINVAL; - err(1, "Invalid method."); - } - - if (write) { - printf("Writing to memory.\n"); - memset(ptr, 1, size); - } - - if (want_sleep) { - /* Signal to caller that we're done. */ - printf("DONE\n"); - - /* Hold memory until external kill signal is delivered. */ - while (1) - sleep(100); - } - - if (method == HUGETLBFS) - close(fd); - - return 0; -} -- cgit v1.2.3 From 6c364edc194e104566f4de72f7a2af5b8fc17110 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:54 +0000 Subject: Docs/admin-guide/mm/numaperf: increase depth of subsections Each section of numaperf.rst has zero depth, and therefore be exposed to the index of admin-guide/mm. Especially 'See Also' section on the index makes the document weird. Hide the sections from the index by giving the document a title and increasing the depth of each section. [sj@kernel.org: change title to fix duplicate label warning] Link: https://lkml.kernel.org/r/20230106194927.152663-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230103180754.129637-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/numaperf.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index 166697325947..544a6d16c801 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -1,6 +1,9 @@ .. _numaperf: -============= +======================= +NUMA Memory Performance +======================= + NUMA Locality ============= @@ -61,7 +64,6 @@ that are CPUs and hence suitable for generic task scheduling, and IO initiators such as GPUs and NICs. Unlike access class 0, only nodes containing CPUs are considered. -================ NUMA Performance ================ @@ -96,7 +98,6 @@ for the platform. Access class 1 takes the same form but only includes values for CPU to memory activity. -========== NUMA Cache ========== @@ -170,7 +171,6 @@ The "size" is the number of bytes provided by this cache level. The "write_policy" will be 0 for write-back, and non-zero for write-through caching. -======== See Also ======== -- cgit v1.2.3 From 4b89a37d54a0b5ed6b2e5a9afc44a15a22e563f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 3 Jan 2023 11:44:30 +0100 Subject: fs: don't allocate blocks beyond EOF from __mpage_writepage When __mpage_writepage() is called for a page beyond EOF, it will go and allocate all blocks underlying the page. This is not only unnecessary but this way blocks can get leaked (e.g. if a page beyond EOF is marked dirty but in the end write fails and i_size is not extended). Link: https://lkml.kernel.org/r/20230103104430.27749-1-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Al Viro Signed-off-by: Andrew Morton --- fs/mpage.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/mpage.c b/fs/mpage.c index d36a95473f77..b8e7975159bc 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -524,6 +524,12 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, */ BUG_ON(!PageUptodate(page)); block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + /* + * Whole page beyond EOF? Skip allocating blocks to avoid leaking + * space. + */ + if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) + goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { -- cgit v1.2.3 From df32de1433412621b92daf1b3369ac053214031e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Jan 2023 12:01:19 +0900 Subject: zram: correctly handle all next_arg() cases When supplied buffer does not have assignment sign next_arg() sets `val` pointer to NULL, so we cannot dereference it. Add a NULL pointer test to handle `param` case, in addition to `*val` test, which handles cases when param has no value assigned to it: `param=`. Link: https://lkml.kernel.org/r/20230103030119.1496358-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7becd5448791..5d1088a645e3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1140,7 +1140,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, while (*args) { args = next_arg(args, ¶m, &val); - if (!*val) + if (!val || !*val) return -EINVAL; if (!strcmp(param, "algo")) { @@ -1824,7 +1824,7 @@ static ssize_t recompress_store(struct device *dev, while (*args) { args = next_arg(args, ¶m, &val); - if (!*val) + if (!val || !*val) return -EINVAL; if (!strcmp(param, "type")) { -- cgit v1.2.3 From da0618c146ca0e1412173a8a229dd737a73b1a4f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 2 Jan 2023 16:11:26 +0000 Subject: selftest/vm: add mremap expand merge offset test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test to assert that we can mremap() and expand a mapping starting from an offset within an existing mapping. We unmap the last page in a 3 page mapping to ensure that the remap should always succeed, before remapping from the 2nd page. This is additionally a regression test for the issue solved in "mm, mremap: fix mremap() expanding vma with addr inside vma" and confirmed to fail prior to the change and pass after it. Finally, this patch updates the existing mremap expand merge test to check error conditions and reduce code duplication between the two tests. [lstoakes@gmail.com: increment num_expand_tests so test doesn't complain about unexpected tests being run] Link: https://lkml.kernel.org/r/8ff3ba3cadc0b6c1b2688ae5c851bf73aa062d57.1673701836.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/02b117a8ffd52acc01dc66c2fb39754f08d92c0e.1672675824.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Jakub Matěna Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 119 +++++++++++++++++++++++++------ 1 file changed, 96 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 9496346973d4..5c3773de9f0f 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -119,47 +119,109 @@ static unsigned long long get_mmap_min_addr(void) } /* - * This test validates that merge is called when expanding a mapping. - * Mapping containing three pages is created, middle page is unmapped - * and then the mapping containing the first page is expanded so that - * it fills the created hole. The two parts should merge creating - * single mapping with three pages. + * Using /proc/self/maps, assert that the specified address range is contained + * within a single mapping. */ -static void mremap_expand_merge(unsigned long page_size) +static bool is_range_mapped(FILE *maps_fp, void *start, void *end) { - char *test_name = "mremap expand merge"; - FILE *fp; char *line = NULL; size_t len = 0; bool success = false; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - munmap(start + page_size, page_size); - mremap(start, page_size, 2 * page_size, 0); - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) { - ksft_test_result_fail("%s\n", test_name); - return; - } + rewind(maps_fp); - while (getline(&line, &len, fp) != -1) { + while (getline(&line, &len, maps_fp) != -1) { char *first = strtok(line, "- "); void *first_val = (void *)strtol(first, NULL, 16); char *second = strtok(NULL, "- "); void *second_val = (void *) strtol(second, NULL, 16); - if (first_val == start && second_val == start + 3 * page_size) { + if (first_val <= start && second_val >= end) { success = true; break; } } + + return success; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + munmap(start + page_size, page_size); + remap = mremap(start, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, page_size); + munmap(start + 2 * page_size, page_size); + goto out; + } + + success = is_range_mapped(maps_fp, start, start + 3 * page_size); + munmap(start, 3 * page_size); + +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Similar to mremap_expand_merge() except instead of removing the middle page, + * we remove the last then attempt to remap offset from the second page. This + * should result in the mapping being restored to its former state. + */ +static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) +{ + + char *test_name = "mremap expand merge offset"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + /* Unmap final page to ensure we have space to expand. */ + munmap(start + 2 * page_size, page_size); + remap = mremap(start + page_size, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, 2 * page_size); + goto out; + } + + success = is_range_mapped(maps_fp, start, start + 3 * page_size); + munmap(start, 3 * page_size); + +out: if (success) ksft_test_result_pass("%s\n", test_name); else ksft_test_result_fail("%s\n", test_name); - fclose(fp); } /* @@ -380,11 +442,12 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; - int num_expand_tests = 1; + int num_expand_tests = 2; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; time_t t; + FILE *maps_fp; pattern_seed = (unsigned int) time(&t); @@ -458,7 +521,17 @@ int main(int argc, char **argv) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); - mremap_expand_merge(page_size); + maps_fp = fopen("/proc/self/maps", "r"); + + if (maps_fp == NULL) { + ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + mremap_expand_merge(maps_fp, page_size); + mremap_expand_merge_offset(maps_fp, page_size); + + fclose(maps_fp); if (run_perf_tests) { ksft_print_msg("\n%s\n", -- cgit v1.2.3 From fc4f4be9b5271e43eeb4c675d190fa9734de9ea3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:54 +0100 Subject: mm/nommu: factor out check for NOMMU shared mappings into is_nommu_shared_mapping() Patch series "mm/nommu: don't use VM_MAYSHARE for MAP_PRIVATE mappings". Trying to reduce the confusion around VM_SHARED and VM_MAYSHARE first requires !CONFIG_MMU to stop using VM_MAYSHARE for MAP_PRIVATE mappings. CONFIG_MMU only sets VM_MAYSHARE for MAP_SHARED mappings. This paves the way for further VM_MAYSHARE and VM_SHARED cleanups: for example, renaming VM_MAYSHARED to VM_MAP_SHARED to make it cleaner what is actually means. Let's first get the weird case out of the way and not use VM_MAYSHARE in MAP_PRIVATE mappings, using a new VM_MAYOVERLAY flag instead. This patch (of 3): We want to stop using VM_MAYSHARE in private mappings to pave the way for clarifying the semantics of VM_MAYSHARE vs. VM_SHARED and reduce the confusion. While CONFIG_MMU uses VM_MAYSHARE to represent MAP_SHARED, !CONFIG_MMU also sets VM_MAYSHARE for selected R/O private file mappings that are an effective overlay of a file mapping. Let's factor out all relevant VM_MAYSHARE checks in !CONFIG_MMU code into is_nommu_shared_mapping() first. Note that whenever VM_SHARED is set, VM_MAYSHARE must be set as well (unless there is a serious BUG). So there is not need to test for VM_SHARED manually. No functional change intended. Link: https://lkml.kernel.org/r/20230102160856.500584-1-david@redhat.com Link: https://lkml.kernel.org/r/20230102160856.500584-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- fs/cramfs/inode.c | 2 +- fs/proc/task_nommu.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/romfs/mmap-nommu.c | 2 +- include/linux/mm.h | 15 +++++++++++++++ io_uring/io_uring.c | 2 +- mm/nommu.c | 11 ++++++----- 8 files changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 83bf2a4dcb57..ffb101d349f0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -343,7 +343,7 @@ static unsigned zero_mmap_capabilities(struct file *file) /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { - return vma->vm_flags & VM_MAYSHARE; + return is_nommu_shared_mapping(vma->vm_flags); } #else diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 61ccf7722fc3..50e4e060db68 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -437,7 +437,7 @@ bailout: static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2fd06f52b6a4..0ec35072a8e5 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -38,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index cb240eac5036..cd4537692751 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) + if (!is_nommu_shared_mapping(vma->vm_flags)) return -ENOSYS; file_accessed(file); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 2c4a23113fb5..4578dc45e50a 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb5bfc77c2c2..791bac40bf8e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1347,6 +1347,21 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ + /* + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. + */ + return flags & VM_MAYSHARE; +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ac1cd8d23ea..3a934f733136 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3206,7 +3206,7 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) diff --git a/mm/nommu.c b/mm/nommu.c index 5b83938ecb67..1671ebbecb8d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -958,9 +958,10 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (capabilities & NOMMU_MAP_DIRECT) { ret = call_mmap(vma->vm_file, vma); + /* shouldn't return success if we're not sharing */ + if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags))) + ret = -ENOSYS; if (ret == 0) { - /* shouldn't return success if we're not sharing */ - BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; } @@ -1106,7 +1107,7 @@ unsigned long do_mmap(struct file *file, * these cases, sharing is handled in the driver or filesystem rather * than here */ - if (vm_flags & VM_MAYSHARE) { + if (is_nommu_shared_mapping(vm_flags)) { struct vm_region *pregion; unsigned long pglen, rpglen, pgend, rpgend, start; @@ -1116,7 +1117,7 @@ unsigned long do_mmap(struct file *file, for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(pregion->vm_flags & VM_MAYSHARE)) + if (!is_nommu_shared_mapping(pregion->vm_flags)) continue; /* search for overlapping mappings on the same file */ @@ -1600,7 +1601,7 @@ static unsigned long do_mremap(unsigned long addr, if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vma->vm_flags & VM_MAYSHARE) + if (is_nommu_shared_mapping(vma->vm_flags)) return (unsigned long) -EPERM; if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) -- cgit v1.2.3 From b6b7a8faf05c709cd9f63d3b7d9c66bd91bc3b0d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:55 +0100 Subject: mm/nommu: don't use VM_MAYSHARE for MAP_PRIVATE mappings Let's stop using VM_MAYSHARE for MAP_PRIVATE mappings and use VM_MAYOVERLAY instead. Rewrite determine_vm_flags() to make the whole logic easier to digest, and to cleanly separate MAP_PRIVATE vs. MAP_SHARED. No functional change intended. Link: https://lkml.kernel.org/r/20230102160856.500584-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ++++++- mm/nommu.c | 51 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 791bac40bf8e..8a8563359946 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -276,7 +276,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ +#else /* CONFIG_MMU */ +#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ +#define VM_UFFD_MISSING 0 +#endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ @@ -1358,7 +1363,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ - return flags & VM_MAYSHARE; + return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif diff --git a/mm/nommu.c b/mm/nommu.c index 1671ebbecb8d..df1711acdf5b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -892,29 +892,36 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); - /* vm_flags |= mm->def_flags; */ - if (!(capabilities & NOMMU_MAP_DIRECT)) { - /* attempt to share read-only copies of mapped file chunks */ + if (!file) { + /* + * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because + * there is no fork(). + */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (file && !(prot & PROT_WRITE)) - vm_flags |= VM_MAYSHARE; + } else if (flags & MAP_PRIVATE) { + /* MAP_PRIVATE file mapping */ + if (capabilities & NOMMU_MAP_DIRECT) + vm_flags |= (capabilities & NOMMU_VMFLAGS); + else + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (!(prot & PROT_WRITE) && !current->ptrace) + /* + * R/O private file mapping which cannot be used to + * modify memory, especially also not via active ptrace + * (e.g., set breakpoints) or later by upgrading + * permissions (no mprotect()). We can try overlaying + * the file mapping, which will work e.g., on chardevs, + * ramfs/tmpfs/shmfs and romfs/cramf. + */ + vm_flags |= VM_MAYOVERLAY; } else { - /* overlay a shareable mapping on the backing device or inode - * if possible - used for chardevs, ramfs/tmpfs/shmfs and - * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); - if (flags & MAP_SHARED) - vm_flags |= VM_SHARED; + /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */ + vm_flags |= VM_SHARED | VM_MAYSHARE | + (capabilities & NOMMU_VMFLAGS); } - /* refuse to let anyone share private mappings with this process if - * it's being traced - otherwise breakpoints set in it may interfere - * with another untraced process - */ - if ((flags & MAP_PRIVATE) && current->ptrace) - vm_flags &= ~VM_MAYSHARE; - return vm_flags; } @@ -952,9 +959,11 @@ static int do_mmap_private(struct vm_area_struct *vma, void *base; int ret, order; - /* invoke the file's mapping function so that it can keep track of - * shared mappings on devices or memory - * - VM_MAYSHARE will be set if it may attempt to share + /* + * Invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory. VM_MAYOVERLAY will be set if + * it may attempt to share, which will make is_nommu_shared_mapping() + * happy. */ if (capabilities & NOMMU_MAP_DIRECT) { ret = call_mmap(vma->vm_file, vma); -- cgit v1.2.3 From 997931ce02b72f15c40e742ee035ce5643ba574f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:56 +0100 Subject: drivers/misc/open-dice: don't touch VM_MAYSHARE A MAP_SHARED mapping always has VM_MAYSHARE set, and writable (VM_MAYWRITE) MAP_SHARED mappings have VM_SHARED set as well. To identify a MAP_SHARED mapping, it's sufficient to look at VM_MAYSHARE. We cannot have VM_MAYSHARE|VM_WRITE mappings without having VM_SHARED set. Consequently, current code will never actually end up clearing VM_MAYSHARE and that code is confusing, because nobody is supposed to mess with VM_MAYWRITE. Let's clean it up and restructure the code. No functional change intended. Link: https://lkml.kernel.org/r/20230102160856.500584-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- drivers/misc/open-dice.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index c61be3404c6f..9dda47b3fd70 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -90,15 +90,13 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) { struct open_dice_drvdata *drvdata = to_open_dice_drvdata(filp); - /* Do not allow userspace to modify the underlying data. */ - if ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED)) - return -EPERM; - - /* Ensure userspace cannot acquire VM_WRITE + VM_SHARED later. */ - if (vma->vm_flags & VM_WRITE) - vma->vm_flags &= ~VM_MAYSHARE; - else if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) { + /* Do not allow userspace to modify the underlying data. */ + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* Ensure userspace cannot acquire VM_WRITE later. */ vma->vm_flags &= ~VM_MAYWRITE; + } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); -- cgit v1.2.3 From 8788f6781486769d9598dcaedc3fe0eb12fc3e59 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 30 Dec 2022 14:52:51 -0700 Subject: mm: add vma_has_recency() Add vma_has_recency() to indicate whether a VMA may exhibit temporal locality that the LRU algorithm relies on. This function returns false for VMAs marked by VM_SEQ_READ or VM_RAND_READ. While the former flag indicates linear access, i.e., a special case of spatial locality, both flags indicate a lack of temporal locality, i.e., the reuse of an area within a relatively small duration. "Recency" is chosen over "locality" to avoid confusion between temporal and spatial localities. Before this patch, the active/inactive LRU only ignored the accessed bit from VMAs marked by VM_SEQ_READ. After this patch, the active/inactive LRU and MGLRU share the same logic: they both ignore the accessed bit if vma_has_recency() returns false. For the active/inactive LRU, the following fio test showed a [6, 8]% increase in IOPS when randomly accessing mapped files under memory pressure. kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) kb=$((kb - 8*1024*1024)) modprobe brd rd_nr=1 rd_size=$kb dd if=/dev/zero of=/dev/ram0 bs=1M mkfs.ext4 /dev/ram0 mount /dev/ram0 /mnt/ swapoff -a fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \ --size=8G --rw=randrw --time_based --runtime=10m \ --group_reporting The discussion that led to this patch is here [1]. Additional test results are available in that thread. [1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/ Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alexander Viro Cc: Andrea Righi Cc: Johannes Weiner Cc: Michael Larabel Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 8 ++++++++ mm/memory.c | 7 +++---- mm/rmap.c | 42 ++++++++++++++++++------------------------ mm/vmscan.c | 5 ++++- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index acf03147fff8..4abebf2615a3 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -594,4 +594,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, #endif } +static inline bool vma_has_recency(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + + return true; +} + #endif diff --git a/mm/memory.c b/mm/memory.c index b0dda866ffe6..90f8f72777c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1402,8 +1402,7 @@ again: force_flush = 1; } } - if (pte_young(ptent) && - likely(!(vma->vm_flags & VM_SEQ_READ))) + if (pte_young(ptent) && likely(vma_has_recency(vma))) mark_page_accessed(page); } rss[mm_counter(page)]--; @@ -5115,8 +5114,8 @@ static inline void mm_account_fault(struct pt_regs *regs, #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { - /* the LRU algorithm doesn't apply to sequential or random reads */ - current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); + /* the LRU algorithm only applies to accesses with recency */ + current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) diff --git a/mm/rmap.c b/mm/rmap.c index 32e48b1c5847..ab74e0547a52 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte) && - !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + if (lru_gen_enabled() && pte_young(*pvmw.pte)) { lru_gen_look_around(&pvmw); referenced++; } if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* - * Don't treat a reference through - * a sequentially read mapping as such. - * If the folio has been used in another mapping, - * we will catch it; if this other mapping is - * already gone, the unmap path will have set - * the referenced flag or activated the folio. - */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) - referenced++; - } + pvmw.pte)) + referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; - if (!mm_match_cgroup(vma->vm_mm, memcg)) + /* + * Ignore references from this mapping if it has no recency. If the + * folio has been used in another mapping, we will catch it; if this + * other mapping is already gone, the unmap path will have set the + * referenced flag or activated the folio in zap_pte_range(). + */ + if (!vma_has_recency(vma)) + return true; + + /* + * If we are reclaiming on behalf of a cgroup, skip counting on behalf + * of references from different cgroups. + */ + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; @@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, + .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; @@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, return 1; } - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg) { - rwc.invalid_vma = invalid_folio_referenced_vma; - } - rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7c3fd900a89d..fe30b8c43f92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3794,7 +3794,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal if (is_vm_hugetlb_page(vma)) return true; - if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + if (!vma_has_recency(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) -- cgit v1.2.3 From 17e810229cb3068b692fa078bd9b3a6527e0866a Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 30 Dec 2022 14:52:52 -0700 Subject: mm: support POSIX_FADV_NOREUSE This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU algorithm can ignore access to mapped files marked by this flag. The advantages of POSIX_FADV_NOREUSE are: 1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the default readahead behavior. 2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and therefore does not take mmap_lock. 3. Unlike MADV_COLD, setting it has a negligible cost, regardless of how many pages it affects. Its limitations are: 1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does not support range. IOW, its scope is the entire file. 2. It currently does not ignore access through file descriptors. Specifically, for the active/inactive LRU, given a file page shared by two users and one of them having set POSIX_FADV_NOREUSE on the file, this page will be activated upon the second user accessing it. This corner case can be covered by checking POSIX_FADV_NOREUSE before calling folio_mark_accessed() on the read path. But it is considered not worth the effort. There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1]. This time the goal is to fill a niche: a few desktop applications, e.g., large file transferring and video encoding/decoding, want fast file streaming with mmap() rather than direct IO. Among those applications, an SVT-AV1 regression was reported when running with MGLRU [2]. The following test can reproduce that regression. kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) kb=$((kb - 8*1024*1024)) modprobe brd rd_nr=1 rd_size=$kb dd if=/dev/zero of=/dev/ram0 bs=1M mkfs.ext4 /dev/ram0 mount /dev/ram0 /mnt/ swapoff -a fallocate -l 8G /mnt/swapfile mkswap /mnt/swapfile swapon /mnt/swapfile wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z 7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z SvtAv1EncApp --preset 12 -w 3840 -h 2160 \ -i /mnt/Bosphorus_3840x2160.y4m For MGLRU, the following change showed a [9-11]% increase in FPS, which makes it on par with the active/inactive LRU. patch Source/App/EncApp/EbAppMain.c < #include 35d35 < #include /* _O_BINARY */ 117a118 > posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE); EOF [1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/ [2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57 Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alexander Viro Cc: Andrea Righi Cc: Johannes Weiner Cc: Michael Larabel Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ include/linux/mm_inline.h | 3 +++ mm/fadvise.c | 5 ++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..d353c262d669 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_NOREUSE ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4abebf2615a3..26dcbda07e92 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -599,6 +599,9 @@ static inline bool vma_has_recency(struct vm_area_struct *vma) if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; + if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) + return false; + return true; } diff --git a/mm/fadvise.c b/mm/fadvise.c index bf04fec87f35..fb7c5f43fd2a 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; + file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: @@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: + spin_lock(&file->f_lock); + file->f_mode |= FMODE_NOREUSE; + spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: __filemap_fdatawrite_range(mapping, offset, endbyte, -- cgit v1.2.3 From 02d65d6fb1aae151570c8bfd1bd77a8153d2e607 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 6 Jan 2023 15:52:51 -0600 Subject: mm: introduce folio_is_pfmemalloc Add a folio equivalent for page_is_pfmemalloc. This removes two instances of page_is_pfmemalloc(folio_page(folio, 0)) so the folio can be used directly. Link: https://lkml.kernel.org/r/20230106215251.599222-1-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++++++++++ mm/slab.c | 2 +- mm/slub.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a8563359946..76c97cb8ee9a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1926,6 +1926,21 @@ static inline bool page_is_pfmemalloc(const struct page *page) return (uintptr_t)page->lru.next & BIT(1); } +/* + * Return true only if the folio has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool folio_is_pfmemalloc(const struct folio *folio) +{ + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)folio->lru.next & BIT(1); +} + /* * Only to be called by the page allocator on a freshly allocated * page. diff --git a/mm/slab.c b/mm/slab.c index 7a269db050ee..b77be9c6d6b1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1373,7 +1373,7 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, /* Make the flag visible before any changes to folio->mapping */ smp_wmb(); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) + if (sk_memalloc_socks() && folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); return slab; diff --git a/mm/slub.c b/mm/slub.c index 13459c69095a..67020074ecb4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1859,7 +1859,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, __folio_set_slab(folio); /* Make the flag visible before any changes to folio->mapping */ smp_wmb(); - if (page_is_pfmemalloc(folio_page(folio, 0))) + if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); return slab; -- cgit v1.2.3 From 61d3d5108eb621d2a097c41f6cc83bf63b1b6c03 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Jan 2023 14:59:00 +0100 Subject: mm: remove PageMovable export The only in-kernel users that need PageMovable() to be exported are z3fold and zsmalloc and they are only using it for dubious debugging functionality. So remove those usages and the export so that no driver code accidentally thinks that they are allowed to use this symbol. Link: https://lkml.kernel.org/r/20230106135900.3763622-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reviewed-by: Sergey Senozhatsky Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Christoph Hellwig Acked-by: Minchan Kim Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - mm/z3fold.c | 2 -- mm/zsmalloc.c | 3 --- 3 files changed, 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index ca1603524bbe..62a61de44658 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -122,7 +122,6 @@ bool PageMovable(struct page *page) return false; } -EXPORT_SYMBOL(PageMovable); void __SetPageMovable(struct page *page, const struct movable_operations *mops) { diff --git a/mm/z3fold.c b/mm/z3fold.c index a4de0c317ac7..0cef845d397b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) struct z3fold_header *zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); if (test_bit(PAGE_HEADLESS, &page->private)) @@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..6aafacd664fc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1973,7 +1973,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); @@ -2005,7 +2004,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ @@ -2070,7 +2068,6 @@ static void zs_page_putback(struct page *page) { struct zspage *zspage; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); -- cgit v1.2.3 From fc8c7d2380ab7d6aa1ddef1f69169ef9a15596eb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:30 +0000 Subject: mm/damon/vaddr: rename 'damon_young_walk_private->page_sz' to 'folio_sz' Patch series "mm/damon/{v,p}addr: misc fixups for folio usage". DAMON's monitoring operations set for the virtual and the physical address spaces use folio now, but some code is not reflecting the fact. Further cleanup the code for folio usage. This patch (of 6): DAMON's virtual address space monitoring operations set is using folio now. Rename 'damon_pa_access_chk_result->page_sz' to reflect the fact. Link: https://lkml.kernel.org/r/20230109213335.62525-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230109213335.62525-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 9d92c5eb3a1f..d6cb1fca1769 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -422,7 +422,8 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) } struct damon_young_walk_private { - unsigned long *page_sz; + /* size of the folio for the access checked virtual memory address */ + unsigned long *folio_sz; bool young; }; @@ -452,7 +453,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = HPAGE_PMD_SIZE; + *priv->folio_sz = HPAGE_PMD_SIZE; priv->young = true; } folio_put(folio); @@ -474,7 +475,7 @@ regular_page: goto out; if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = PAGE_SIZE; + *priv->folio_sz = PAGE_SIZE; priv->young = true; } folio_put(folio); @@ -504,7 +505,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, if (pte_young(entry) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = huge_page_size(h); + *priv->folio_sz = huge_page_size(h); priv->young = true; } @@ -524,10 +525,10 @@ static const struct mm_walk_ops damon_young_ops = { }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, - unsigned long *page_sz) + unsigned long *folio_sz) { struct damon_young_walk_private arg = { - .page_sz = page_sz, + .folio_sz = folio_sz, .young = false, }; @@ -547,18 +548,18 @@ static void __damon_va_check_access(struct mm_struct *mm, struct damon_region *r, bool same_target) { static unsigned long last_addr; - static unsigned long last_page_sz = PAGE_SIZE; + static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == - ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == + ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { if (last_accessed) r->nr_accesses++; return; } - last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); if (last_accessed) r->nr_accesses++; -- cgit v1.2.3 From 18fd73dbe5c39707b51552d622235e5c41e3d869 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:31 +0000 Subject: mm/damon/vaddr: support folio of neither HPAGE_PMD_SIZE nor PAGE_SIZE DAMON virtual address space monitoring operations set treats folios having non-HPAGE_PMD_SIZE size as having PAGE_SIZE size. Use the exact size of the folio. Link: https://lkml.kernel.org/r/20230109213335.62525-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index d6cb1fca1769..c7b192006fe6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -475,7 +475,7 @@ regular_page: goto out; if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = PAGE_SIZE; + *priv->folio_sz = folio_size(folio); priv->young = true; } folio_put(folio); -- cgit v1.2.3 From 7477d7560cb2c756d6a8ab165d9ed52537df54e7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:32 +0000 Subject: mm/damon/vaddr: record appropriate folio size when the access is not found DAMON virtual address spaces monitoring operations set doesn't set folio size of the access checked address if access is not found. It could result in unnecessary and inefficient repeated check. Appropriately set the size regardless of access check result. Link: https://lkml.kernel.org/r/20230109213335.62525-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c7b192006fe6..1fec16d7263e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -452,10 +452,9 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, goto huge_out; if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, - addr)) { - *priv->folio_sz = HPAGE_PMD_SIZE; + addr)) priv->young = true; - } + *priv->folio_sz = HPAGE_PMD_SIZE; folio_put(folio); huge_out: spin_unlock(ptl); @@ -474,10 +473,9 @@ regular_page: if (!folio) goto out; if (pte_young(*pte) || !folio_test_idle(folio) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = folio_size(folio); + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->folio_sz = folio_size(folio); folio_put(folio); out: pte_unmap_unlock(pte, ptl); @@ -504,10 +502,9 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, folio_get(folio); if (pte_young(entry) || !folio_test_idle(folio) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = huge_page_size(h); + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->folio_sz = huge_page_size(h); folio_put(folio); -- cgit v1.2.3 From af40e35a992ff5691166badd52a4fa2f940c2cea Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:33 +0000 Subject: mm/damon/paddr: rename 'damon_pa_access_chk_result->page_sz' to 'folio_sz' DAMON's physical address space monitoring operations set is using folio now. Rename 'damon_pa_access_chk_result->page_sz' to reflect the fact. Link: https://lkml.kernel.org/r/20230109213335.62525-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 99d4c357ef2b..65c1e0f91535 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -80,7 +80,8 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } struct damon_pa_access_chk_result { - unsigned long page_sz; + /* size of the folio for the access checked physical memory address */ + unsigned long folio_sz; bool accessed; }; @@ -91,7 +92,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); result->accessed = false; - result->page_sz = PAGE_SIZE; + result->folio_sz = PAGE_SIZE; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { @@ -103,7 +104,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); - result->page_sz = HPAGE_PMD_SIZE; + result->folio_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -118,11 +119,11 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, return !result->accessed; } -static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { - .page_sz = PAGE_SIZE, + .folio_sz = PAGE_SIZE, .accessed = false, }; struct rmap_walk_control rwc = { @@ -157,25 +158,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) folio_put(folio); out: - *page_sz = result.page_sz; + *folio_sz = result.folio_sz; return result.accessed; } static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; - static unsigned long last_page_sz = PAGE_SIZE; + static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (ALIGN_DOWN(last_addr, last_page_sz) == - ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (ALIGN_DOWN(last_addr, last_folio_sz) == + ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { if (last_accessed) r->nr_accesses++; return; } - last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); if (last_accessed) r->nr_accesses++; -- cgit v1.2.3 From 397b0c3a584b4176b4956b519ea3f1402d61fc4e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:34 +0000 Subject: mm/damon/paddr: remove folio_sz field from damon_pa_access_chk_result DAMON physical address space monitoring operations set gets and saves size of the folio for a given physical address inside rmap walks, but it can be directly caluclated outside of the walks. Remove the 'folio_sz' field from 'damon_pa_access_chk_result struct' and calculate the size directly from outside of the walks. Link: https://lkml.kernel.org/r/20230109213335.62525-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 65c1e0f91535..b51606519bbd 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -80,8 +80,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } struct damon_pa_access_chk_result { - /* size of the folio for the access checked physical memory address */ - unsigned long folio_sz; bool accessed; }; @@ -92,7 +90,6 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); result->accessed = false; - result->folio_sz = PAGE_SIZE; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { @@ -104,7 +101,6 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); - result->folio_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -123,7 +119,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { - .folio_sz = PAGE_SIZE, .accessed = false, }; struct rmap_walk_control rwc = { @@ -158,7 +153,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) folio_put(folio); out: - *folio_sz = result.folio_sz; + *folio_sz = folio_size(folio); return result.accessed; } -- cgit v1.2.3 From b0c0e744e8a471f1c710197faaab0b81c461f8c0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:35 +0000 Subject: mm/damon/paddr: remove damon_pa_access_chk_result struct 'damon_pa_access_chk_result' struct contains only one field. Use a variable instead. Link: https://lkml.kernel.org/r/20230109213335.62525-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b51606519bbd..b4df9b9bcc0a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -79,50 +79,44 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -struct damon_pa_access_chk_result { - bool accessed; -}; - static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct damon_pa_access_chk_result *result = arg; + bool *accessed = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - result->accessed = false; + *accessed = false; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - result->accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(*pvmw.pte) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - result->accessed = pmd_young(*pvmw.pmd) || + *accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } - if (result->accessed) { + if (*accessed) { page_vma_mapped_walk_done(&pvmw); break; } } /* If accessed, stop walking */ - return !result->accessed; + return *accessed == false; } static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); - struct damon_pa_access_chk_result result = { - .accessed = false, - }; + bool accessed = false; struct rmap_walk_control rwc = { - .arg = &result, + .arg = &accessed, .rmap_one = __damon_pa_young, .anon_lock = folio_lock_anon_vma_read, }; @@ -133,9 +127,9 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) - result.accessed = false; + accessed = false; else - result.accessed = true; + accessed = true; folio_put(folio); goto out; } @@ -154,7 +148,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) out: *folio_sz = folio_size(folio); - return result.accessed; + return accessed; } static void __damon_pa_check_access(struct damon_region *r) -- cgit v1.2.3 From c4876ff68716e5372224d17045b47610d667a0ee Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 9 Jan 2023 17:43:32 +0000 Subject: mm/debug: use valid physical memory for pmd/pud tests The page table debug tests need a physical address to validate low-level page table manipulation with. The memory at this address is not actually touched, it just encoded in the page table entries at various levels during the tests only. Since the memory is not used, the code just picks the physical address of the start_kernel symbol. This value is then truncated to get a properly aligned address that is to be used for various tests. Because of the truncation, the address might not actually exist, or might not describe a complete huge page. That's not a problem for most tests, but the arch-specific code may check for attribute validity and consistency. The x86 version of {pud,pmd}_set_huge actually validates the MTRRs for the PMD/PUD range. This may fail with an address derived from start_kernel, depending on where the kernel was loaded and what the physical memory layout of the system is. This then leads to false negatives for the {pud,pmd}_set_huge tests. Avoid this by finding a properly aligned memory range that exists and is usable. If such a range is not found, skip the tests that needed it. [fvdl@google.com: v3] Link: https://lkml.kernel.org/r/20230110181208.1633879-1-fvdl@google.com Link: https://lkml.kernel.org/r/20230109174332.329366-1-fvdl@google.com Fixes: 399145f9eb6c ("mm/debug: add tests validating architecture page table helpers") Signed-off-by: Frank van der Linden Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 102 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index c631ade3f1d2..bb3328f46126 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ struct pgtable_debug_args { unsigned long pmd_pfn; unsigned long pte_pfn; + unsigned long fixed_alignment; unsigned long fixed_pgd_pfn; unsigned long fixed_p4d_pfn; unsigned long fixed_pud_pfn; @@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!arch_vmap_pmd_supported(args->page_prot)) + if (!arch_vmap_pmd_supported(args->page_prot) || + args->fixed_alignment < PMD_SIZE) return; pr_debug("Validating PMD huge\n"); @@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!arch_vmap_pud_supported(args->page_prot)) + if (!arch_vmap_pud_supported(args->page_prot) || + args->fixed_alignment < PUD_SIZE) return; pr_debug("Validating PUD huge\n"); @@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) return page; } +/* + * Check if a physical memory range described by contains + * an area that is of size psize, and aligned to psize. + * + * Don't use address 0, an all-zeroes physical address might mask bugs, and + * it's not used on x86. + */ +static void __init phys_align_check(phys_addr_t pstart, + phys_addr_t pend, unsigned long psize, + phys_addr_t *physp, unsigned long *alignp) +{ + phys_addr_t aligned_start, aligned_end; + + if (pstart == 0) + pstart = PAGE_SIZE; + + aligned_start = ALIGN(pstart, psize); + aligned_end = aligned_start + psize; + + if (aligned_end > aligned_start && aligned_end <= pend) { + *alignp = psize; + *physp = aligned_start; + } +} + +static void __init init_fixed_pfns(struct pgtable_debug_args *args) +{ + u64 idx; + phys_addr_t phys, pstart, pend; + + /* + * Initialize the fixed pfns. To do this, try to find a + * valid physical range, preferably aligned to PUD_SIZE, + * but settling for aligned to PMD_SIZE as a fallback. If + * neither of those is found, use the physical address of + * the start_kernel symbol. + * + * The memory doesn't need to be allocated, it just needs to exist + * as usable memory. It won't be touched. + * + * The alignment is recorded, and can be checked to see if we + * can run the tests that require an actual valid physical + * address range on some architectures ({pmd,pud}_huge_test + * on x86). + */ + + phys = __pa_symbol(&start_kernel); + args->fixed_alignment = PAGE_SIZE; + + for_each_mem_range(idx, &pstart, &pend) { + /* First check for a PUD-aligned area */ + phys_align_check(pstart, pend, PUD_SIZE, &phys, + &args->fixed_alignment); + + /* If a PUD-aligned area is found, we're done */ + if (args->fixed_alignment == PUD_SIZE) + break; + + /* + * If no PMD-aligned area found yet, check for one, + * but continue the loop to look for a PUD-aligned area. + */ + if (args->fixed_alignment < PMD_SIZE) + phys_align_check(pstart, pend, PMD_SIZE, &phys, + &args->fixed_alignment); + } + + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); +} + + static int __init init_args(struct pgtable_debug_args *args) { struct page *page = NULL; - phys_addr_t phys; int ret = 0; /* @@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args) args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); WARN_ON(!args->start_ptep); - /* - * PFN for mapping at PTE level is determined from a standard kernel - * text symbol. But pfns for higher page table levels are derived by - * masking lower bits of this real pfn. These derived pfns might not - * exist on the platform but that does not really matter as pfn_pxx() - * helpers will still create appropriate entries for the test. This - * helps avoid large memory block allocations to be used for mapping - * at higher page table levels in some of the tests. - */ - phys = __pa_symbol(&start_kernel); - args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); - args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); - args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); - args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); - args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); - WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + init_fixed_pfns(args); /* * Allocate (huge) pages because some of the tests need to access -- cgit v1.2.3 From f4d9139f1394cbe2de158ab8771fea4e587004d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 Jan 2023 18:12:55 +0100 Subject: selftests/mm: define MADV_PAGEOUT to fix compilation issues If MADV_PAGEOUT is not defined (e.g., on AlmaLinux 8), compilation will fail. Let's fix that like khugepaged.c does by conditionally defining MADV_PAGEOUT. Link: https://lkml.kernel.org/r/20230109171255.488749-1-david@redhat.com Fixes: 69c66add5663 ("selftests/vm: anon_cow: test COW handling of anonymous memory") Signed-off-by: David Hildenbrand Reported-by: Mirsad Goran Todorovac Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 16216d893d96..0eb2e8180aa5 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -30,6 +30,9 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif #ifndef MADV_COLLAPSE #define MADV_COLLAPSE 25 #endif -- cgit v1.2.3 From e8dfc854eef20ac7663996f61837299887f380fc Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 7 Dec 2022 10:10:09 -0800 Subject: ext4: convert mext_page_double_lock() to mext_folio_double_lock() Convert mext_page_double_lock() to use folios. This change saves 146 bytes of kernel text. It also removes 6 calls to compound_head() and 2 calls to folio_file_page(). Link: https://lkml.kernel.org/r/20221207181009.4016-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8dbb87edf24c..2de9829aed63 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -110,22 +110,23 @@ out: } /** - * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index1: page index - * @index2: page index - * @page: result page vector + * @index1: folio index + * @index2: folio index + * @folio: result folio vector * - * Grab two locked pages for inode's by inode order + * Grab two locked folio for inode's by inode order */ static int -mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index1, pgoff_t index2, struct page *page[2]) +mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -138,28 +139,30 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - page[0] = grab_cache_page_write_begin(mapping[0], index1); - if (!page[0]) { + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + mapping_gfp_mask(mapping[0])); + if (!folio[0]) { memalloc_nofs_restore(flags); return -ENOMEM; } - page[1] = grab_cache_page_write_begin(mapping[1], index2); + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!page[1]) { - unlock_page(page[0]); - put_page(page[0]); + if (!folio[1]) { + folio_unlock(folio[0]); + folio_put(folio[0]); return -ENOMEM; } /* - * grab_cache_page_write_begin() may not wait on page's writeback if + * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. But it is reasonable to be very conservative - * here and explicitly wait on page's writeback + * here and explicitly wait on folio's writeback */ - wait_on_page_writeback(page[0]); - wait_on_page_writeback(page[1]); + folio_wait_writeback(folio[0]); + folio_wait_writeback(folio[1]); if (inode1 > inode2) - swap(page[0], page[1]); + swap(folio[0], folio[1]); return 0; } @@ -252,7 +255,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); - struct page *pagep[2] = {NULL, NULL}; struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; @@ -303,8 +305,8 @@ again: replaced_size = data_size; - *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - donor_page_offset, pagep); + *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, + donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* @@ -314,8 +316,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - folio[0] = page_folio(pagep[0]); - folio[1] = page_folio(pagep[1]); VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); -- cgit v1.2.3 From b6f00c9190c8e694c9b2b38e7cc63964f7a99195 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Mon, 9 Jan 2023 19:46:55 +0800 Subject: mm/damon/sysfs-schemes: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL-terminated strings. Link: https://lkml.kernel.org/r/202301091946553770006@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0dabe3e2dc0..86edca66aab1 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -353,8 +353,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, if (!path) return -ENOMEM; - strncpy(path, buf, count); - path[count] = '\0'; + strscpy(path, buf, count + 1); filter->memcg_path = path; return count; } -- cgit v1.2.3 From d526643f155c431e8dfef643195f2d636d4e4bb5 Mon Sep 17 00:00:00 2001 From: Alexander Pantyukhin Date: Sun, 8 Jan 2023 15:50:23 +0500 Subject: tools:cgroup:memcg_shrinker remove redundant import Remove redundant import of the sys module. Also use the sort function instead of sorted. It sorts the direct array without create the new one in memory. Link: https://lkml.kernel.org/r/20230108105023.4289-1-apantykhin@gmail.com Signed-off-by: Alexander Pantyukhin Cc: Roman Gushchin Signed-off-by: Andrew Morton --- tools/cgroup/memcg_shrinker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py index 706ab27666a4..e81c3017ada9 100644 --- a/tools/cgroup/memcg_shrinker.py +++ b/tools/cgroup/memcg_shrinker.py @@ -5,7 +5,6 @@ import os import argparse -import sys def scan_cgroups(cgroup_root): @@ -44,7 +43,7 @@ def main(): cgroups = scan_cgroups("/sys/fs/cgroup/") shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/") - shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0]) + shrinkers.sort(reverse = True, key = lambda x: x[0]) n = 0 for s in shrinkers: -- cgit v1.2.3 From 9a3f21fe5cb9f5654ccad7ba712d868f7de66e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 9 Jan 2023 12:42:51 +0100 Subject: selftests: vm: enable cross-compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selftests vm builds break when doing cross-compilation. The Makefile MACHINE variable incorrectly picks upp the host machine architecture. If the CROSS_COMPILE variable is set, dig out the target host architecture from CROSS_COMPILE, instead of calling uname. Link: https://lkml.kernel.org/r/20230109114251.3349638-1-bjorn@kernel.org Signed-off-by: Björn Töpel Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 6a4b639b2b2b..0a44d77f8437 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -5,7 +5,11 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk +ifeq ($(CROSS_COMPILE),) uname_M := $(shell uname -m 2>/dev/null || echo not) +else +uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') +endif MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') # Without this, failed build products remain, with up-to-date timestamps, -- cgit v1.2.3 From 92b64bd01fe99325ba0f51125bcb991f1566eadc Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 7 Dec 2022 23:53:08 +0100 Subject: mm/highmem: add notes about conversions from kmap{,_atomic}() kmap() and kmap_atomic() have been deprecated. kmap_local_page() should always be used in new code and the call sites of the two deprecated functions should be converted. This latter task can lead to errors if it is not carried out with the necessary attention to the context around and between the maps and unmaps. Therefore, add further information to the Highmem's documentation for the purpose to make it clearer that (1) kmap() and kmap_atomic() must not any longer be called in new code and (2) developers doing conversions from kmap() amd kmap_atomic() are expected to take care of the context around and between the maps and unmaps, in order to not break the code. Relevant parts of this patch have been taken from messages exchanged privately with Ira Weiny (thanks!). [fmdefrancesco@gmail.com: merge two sentences into one, per Bagas] Link: https://lkml.kernel.org/r/20230119123945.10471-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20221207225308.8290-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- Documentation/mm/highmem.rst | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index 0f731d9196b0..e691a06fb337 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -57,7 +57,8 @@ list shows them in order of preference of use. It can be invoked from any context (including interrupts) but the mappings can only be used in the context which acquired them. - This function should be preferred, where feasible, over all the others. + This function should always be used, whereas kmap_atomic() and kmap() have + been deprecated. These mappings are thread-local and CPU-local, meaning that the mapping can only be accessed from within this thread and the thread is bound to the @@ -100,10 +101,21 @@ list shows them in order of preference of use. (included in the "Functions" section) for details on how to manage nested mappings. -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. +* kmap_atomic(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). Furthermore, the code between + calls to kmap_atomic() and kunmap_atomic() may implicitly depend on the side + effects of atomic mappings, i.e. disabling page faults or preemption, or both. + In that case, explicit calls to pagefault_disable() or preempt_disable() or + both must be made in conjunction with the use of kmap_local_page(). + + [Legacy documentation] + + This permits a very short duration mapping of a single page. Since the + mapping is restricted to the CPU that issued it, it performs well, but + the issuing task is therefore required to stay on that CPU until it has + finished, lest some other task displace its mappings. kmap_atomic() may also be used by interrupt contexts, since it does not sleep and the callers too may not sleep until after kunmap_atomic() is @@ -115,11 +127,20 @@ list shows them in order of preference of use. It is assumed that k[un]map_atomic() won't fail. -* kmap(). This should be used to make short duration mapping of a single - page with no restrictions on preemption or migration. It comes with an - overhead as mapping space is restricted and protected by a global lock - for synchronization. When mapping is no longer needed, the address that - the page was mapped to must be released with kunmap(). +* kmap(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). In particular, it is necessary to + make sure that the kernel virtual memory pointer is only valid in the thread + that obtained it. + + [Legacy documentation] + + This should be used to make short duration mapping of a single page with no + restrictions on preemption or migration. It comes with an overhead as mapping + space is restricted and protected by a global lock for synchronization. When + mapping is no longer needed, the address that the page was mapped to must be + released with kunmap(). Mapping changes must be propagated across all the CPUs. kmap() also requires global TLB invalidation when the kmap's pool wraps and it might -- cgit v1.2.3 From fb6f026b833a71f4701e12b43800e46d7351f7a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:53 +0000 Subject: mm/damon/core: update kernel-doc comments for DAMOS action supports of each DAMON operations set Patch series "mm/damon: trivial fixups". This patchset contains patches for trivial fixups of DAMON's documentation, MAINTAINERS section, and selftests. This patch (of 8): Supports of each DAMOS action are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230110190400.119388-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7907918ad2e0..3fa96d7c9fe4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,6 +91,12 @@ struct damon_target { * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. */ enum damos_action { DAMOS_WILLNEED, -- cgit v1.2.3 From 55901e89d2864b5ef9961892470eedf29279d412 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:54 +0000 Subject: mm/damon/core: update kernel-doc comments for DAMOS filters supports of each DAMON operations set Supports of each DAMOS filter type are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3fa96d7c9fe4..dfb245bb3053 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,11 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR is supporting all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any + * filter types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, -- cgit v1.2.3 From 86834644e3c9301ccd28df1293c37306a6332f3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:55 +0000 Subject: Docs/mm/damon/index: mention DAMOS on the intro What DAMON aims to do is not only access monitoring but efficient and effective access-aware system operations. And DAMon-based Operation Schemes (DAMOS) is the important feature of DAMON for the goal. Make the intro of DAMON documentation to emphasize the goal and mention DAMOS. Link: https://lkml.kernel.org/r/20230110190400.119388-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 48c0bbff98b2..2983699c12ea 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -4,8 +4,9 @@ DAMON: Data Access MONitor ========================== -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it +DAMON is a Linux kernel subsystem that provides a framework for data access +monitoring and the monitoring results based system operations. The core +monitoring mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *accurate* (the monitoring output is useful enough for DRAM level memory management; It might not appropriate for CPU Cache levels, though), @@ -14,12 +15,16 @@ The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *scalable* (the upper-bound of the overhead is in constant range regardless of the size of target workloads). -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incurring high data accesses monitoring overhead could implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. +Using this framework, therefore, the kernel can operate system in an +access-aware fashion. Because the features are also exposed to the user space, +users who have special information about their workloads can write personalized +applications for better understanding and optimizations of their workloads and +systems. + +For easier development of such systems, DAMON provides a feature called DAMOS +(DAMon-based Operation Schemes) in addition to the monitoring. Using the +feature, DAMON users in both kernel and user spaces can do access-aware system +operations with no code but simple configurations. .. toctree:: :maxdepth: 2 -- cgit v1.2.3 From 9a47c411ccddf843812dbbff707bd45e3bc83f40 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:56 +0000 Subject: Docs/admin-guide/mm/damon/usage: update DAMOS actions/filters supports of each DAMON operations set Supports of each DAMOS action and filters are up to DAMON operations set implementation, but it's not mentioned in detail on the documentation. Update the information on the usage document. Link: https://lkml.kernel.org/r/20230110190400.119388-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 41 +++++++++++++++++++--------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 3d82ca6a17ff..9237d6a25897 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -279,14 +279,25 @@ The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords that can be written to and read from the file and their meaning are as below. - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD`` - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` +Note that support of each action depends on the running DAMON operations set +`implementation `. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. - - ``stat``: Do nothing but count the statistics + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. schemes//access_pattern/ --------------------------- @@ -388,8 +399,8 @@ pages of all memory cgroups except ``/having_care_already``.:: echo /having_care_already > 1/memcg_path echo N > 1/matching -Note that filters could be ignored depend on the running DAMON operations set -`implementation `. +Note that filters are currently supported only when ``paddr`` +`implementation ` is being used. .. _sysfs_schemes_stats: @@ -618,11 +629,15 @@ The ```` is a predefined integer for memory management actions, which DAMON will apply to the regions having the target access pattern. The supported numbers and their meanings are as below. - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if + ``target`` is ``paddr``. + - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if + ``target`` is ``paddr``. + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if + ``target`` is ``paddr``. + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if + ``target`` is ``paddr``. - 5: Do nothing but count the statistics Quota -- cgit v1.2.3 From e7366f3a2ed0f554354a1d7fe8bf5d98bab5247e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:57 +0000 Subject: Docs/mm/damon: add a maintainer-profile for DAMON Document the basic policies and expectations for DAMON development. Link: https://lkml.kernel.org/r/20230110190400.119388-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 1 + Documentation/mm/damon/maintainer-profile.rst | 62 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 Documentation/mm/damon/maintainer-profile.rst diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 2983699c12ea..5e0a50583500 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -32,3 +32,4 @@ operations with no code but simple configurations. faq design api + maintainer-profile diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst new file mode 100644 index 000000000000..24a202f03de8 --- /dev/null +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +DAMON Maintainer Entry Profile +============================== + +The DAMON subsystem covers the files that listed in 'DATA ACCESS MONITOR' +section of 'MAINTAINERS' file. + +The mailing lists for the subsystem are damon@lists.linux.dev and +linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_ +whenever possible and posted to the mailing lists. + +SCM Trees +--------- + +There are multiple Linux trees for DAMON development. Patches under +development or testing are queued in damon/next [2]_ by the DAMON maintainer. +Suffieicntly reviewed patches will be queued in mm-unstable [1]_ by the memory +management subsystem maintainer. After more sufficient tests, the patches will +be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the +memory management subsystem maintainer. + +Note again the patches for review should be made against the mm-unstable +tree[1] whenever possible. damon/next is only for preview of others' works in +progress. + +Submit checklist addendum +------------------------- + +When making DAMON changes, you should do below. + +- Build changes related outputs including kernel and documents. +- Ensure the builds introduce no new errors or warnings. +- Run and ensure no new failures for DAMON selftests [4]_ and kunittests [5]_ . + +Further doing below and putting the results will be helpful. + +- Run damon-tests/corr [6]_ for normal changes. +- Run damon-tests/perf [7]_ for performance changes. + +Key cycle dates +--------------- + +Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and +mm-stable[3] trees depend on the memory management subsystem maintainer. + +Review cadence +-------------- + +The DAMON maintainer does the work on the usual work hour (09:00 to 17:00, +Mon-Fri) in PST. The response to patches will occasionally be slow. Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. + + +.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable +.. [2] https://git.kernel.org/sj/h/damon/next +.. [3] https://git.kernel.org/akpm/mm/h/mm-stable +.. [4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 +.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh +.. [6] https://github.com/awslabs/damon-tests/tree/master/corr +.. [7] https://github.com/awslabs/damon-tests/tree/master/perf -- cgit v1.2.3 From 2d2230efbcecda7747a2bee659eae474f504ef42 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:58 +0000 Subject: MAINTAINERS/DAMON: link maintainer profile, git trees, and website Add links to below DAMON development related resource to DAMON section in MAINTAINERS file. - The basic policies and expectations of DAMON development, - DAMON development trees, and - DAMON introduction website. Link: https://lkml.kernel.org/r/20230110190400.119388-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8ac1472bea34..b92a2a0cb36b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5790,6 +5790,11 @@ M: SeongJae Park L: damon@lists.linux.dev L: linux-mm@kvack.org S: Maintained +W: https://damonitor.github.io +P: Documentation/mm/damon/maintainer-profile.rst +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git damon/next F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ F: Documentation/mm/damon/ -- cgit v1.2.3 From 16ddcb15497e11a2695c604357e77140010d3d51 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:59 +0000 Subject: selftests/damon/sysfs: hide expected write failures DAMON selftests for sysfs (sysfs.sh) tests if some writes to DAMON sysfs interface files fails as expected. It makes the test results noisy with the failure error message because it tests a number of such failures. Redirect the expected failure error messages to /dev/null to make the results clean. Link: https://lkml.kernel.org/r/20230110190400.119388-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index a00336ffdcad..bcd4734ca094 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -24,7 +24,7 @@ ensure_write_fail() content=$2 reason=$3 - if echo "$content" > "$file" + if (echo "$content" > "$file") 2> /dev/null then echo "writing $content to $file succeed ($fail_reason)" echo "expected failure because $reason" -- cgit v1.2.3 From 75cb348714f527ce2de3446202b76ce74808b668 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:04:00 +0000 Subject: selftests/damon/debugfs_rm_non_contexts: hide expected write error messages A selftest case for DAMON debugfs interface has a test for expected failure. To make the test output clean, hide the expected failure error message. Link: https://lkml.kernel.org/r/20230110190400.119388-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/debugfs_rm_non_contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh index 48b7af6b022c..f3ffeb1343cf 100644 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -10,7 +10,7 @@ dmesg -C for file in "$DBGFS/"* do - echo "$(basename "$f")" > "$DBGFS/rm_contexts" + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null if dmesg | grep -q BUG then dmesg -- cgit v1.2.3 From c5d5546ea06512accc894cd19265c7041a6ac81a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Tue, 10 Jan 2023 23:42:11 +0800 Subject: maple_tree: remove the parameter entry of mas_preallocate The parameter entry of mas_preallocate is not used, so drop it. Link: https://lkml.kernel.org/r/20230110154211.1758562-1-vernon2gm@gmail.com Signed-off-by: Vernon Yang Cc: Liam Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 3 +-- mm/mmap.c | 16 ++++++++-------- mm/nommu.c | 8 ++++---- tools/testing/radix-tree/maple.c | 32 ++++++++++++++++---------------- 5 files changed, 30 insertions(+), 31 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 815a27661517..a7bf58fd7cc6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index baff62a012e1..5be99550e36d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5700,12 +5700,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state - * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, gfp_t gfp) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index 425a9349e610..4fe29b8f99b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -472,7 +472,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; if (vma->vm_file) { @@ -538,7 +538,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -712,7 +712,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -1938,7 +1938,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2019,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2311,7 +2311,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) return -ENOMEM; mas->last = end - 1; @@ -2680,7 +2680,7 @@ cannot_expand: goto free_vma; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); diff --git a/mm/nommu.c b/mm/nommu.c index df1711acdf5b..0481922fe66e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -602,7 +602,7 @@ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -633,7 +633,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -1091,7 +1091,7 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) goto error_maple_preallocate; region->vm_usage = 1; @@ -1369,7 +1369,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); goto err_mas_preallocate; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 1f36bc1c5d36..958ee9bdb316 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35342,7 +35342,7 @@ static noinline void check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35351,18 +35351,18 @@ static noinline void check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35370,25 +35370,25 @@ static noinline void check_prealloc(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35397,12 +35397,12 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35410,21 +35410,21 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35432,14 +35432,14 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35447,7 +35447,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); -- cgit v1.2.3 From baabcfc93d3b5b14c52eb5c18d38627c32d2d82b Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Tue, 10 Jan 2023 22:53:53 +0800 Subject: mm/mmap: fix typo in comment Replace "parital" with "partial". Link: https://lkml.kernel.org/r/20230110145353.1658435-1-vernon2gm@gmail.com Signed-off-by: Vernon Yang Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4fe29b8f99b0..0641e6e0016c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2889,7 +2889,7 @@ out: } /* - * brk_munmap() - Unmap a parital vma. + * brk_munmap() - Unmap a partial vma. * @mas: The maple tree state. * @vma: The vma to be modified * @newbrk: the start of the address to unmap -- cgit v1.2.3 From c6835e8d86bcd8313347e097da140057772307c0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:18 +0800 Subject: mm: compaction: remove redundant VM_BUG_ON() in compact_zone() Patch series "Some small improvements for compaction". When I did some compaction testing, I found some small room for improvement as well as some code cleanups. This patch (of 5): The compaction_suitable() will never return values other than COMPACT_SUCCESS, COMPACT_SKIPPED and COMPACT_CONTINUE, so after validation of COMPACT_SUCCESS and COMPACT_SKIPPED, we will never hit other unexpected case. Thus remove the redundant VM_BUG_ON() validation for the return values of compaction_suitable(). Link: https://lkml.kernel.org/r/cover.1673342761.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/740a2396d9b98154dba76e326cba5e798b640ead.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 62a61de44658..5e6f5e35748d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2313,9 +2313,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; - /* huh, compaction_suitable is returning something unexpected */ - VM_BUG_ON(ret != COMPACT_CONTINUE); - /* * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. -- cgit v1.2.3 From 753ec50d976c28b08266dec3110905b377464eb1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:19 +0800 Subject: mm: compaction: move list validation into compact_zone() Move the cc.freepages and cc.migratepages list validation into compact_zone() to remove some duplicate code. Link: https://lkml.kernel.org/r/15cf54f7d762e87b04ac3cc74536f7d1ebbcd8cd.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 5e6f5e35748d..f8e8addc8664 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2488,6 +2488,9 @@ out: trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); + VM_BUG_ON(!list_empty(&cc->freepages)); + VM_BUG_ON(!list_empty(&cc->migratepages)); + return ret; } @@ -2526,9 +2529,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, ret = compact_zone(&cc, &capc); - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); - /* * Make sure we hide capture control first before we read the captured * page pointer, otherwise an interrupt could free and capture a page @@ -2659,9 +2659,6 @@ static void proactive_compact_node(pg_data_t *pgdat) cc.zone = zone; compact_zone(&cc, NULL); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } } @@ -2689,9 +2686,6 @@ static void compact_node(int nid) cc.zone = zone; compact_zone(&cc, NULL); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } } @@ -2868,9 +2862,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.total_migrate_scanned); count_compact_events(KCOMPACTD_FREE_SCANNED, cc.total_free_scanned); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } /* -- cgit v1.2.3 From 1bfb7684db1233d9e3f3f26fbbc0c58d40ff65e7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:20 +0800 Subject: mm: compaction: count the migration scanned pages events for proactive compaction The proactive compaction will reuse per-node kcompactd threads, so we should also count the KCOMPACTD_MIGRATE_SCANNED and KCOMPACTD_FREE_SCANNED events for proactive compaction. Link: https://lkml.kernel.org/r/b7f1ece1adc17defa47e3667b5f9fd61f496517a.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index f8e8addc8664..62f6bb68c9cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2659,6 +2659,11 @@ static void proactive_compact_node(pg_data_t *pgdat) cc.zone = zone; compact_zone(&cc, NULL); + + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); } } -- cgit v1.2.3 From 8fff8b6f8d0ef7620e06f3f4cfb912171aef6cd5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:21 +0800 Subject: mm: compaction: add missing kcompactd wakeup trace event Add missing kcompactd wakeup trace event for proactive compaction, meanwhile use order = -1 and the highest zone index of the pgdat for the kcompactd wakeup trace event by proactive compaction. Link: https://lkml.kernel.org/r/cbf8097a2d8a1b6800991f2a21575550d3613ce6.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 62f6bb68c9cb..0fd6c81a7809 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2730,6 +2730,8 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, continue; pgdat->proactive_compact_trigger = true; + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1, + pgdat->nr_zones - 1); wake_up_interruptible(&pgdat->kcompactd_wait); } } -- cgit v1.2.3 From 9e5522715e6941bcfdc08d066a79d6da0f8cec8e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:22 +0800 Subject: mm: compaction: avoid fragmentation score calculation for empty zones There is no need to calculate the fragmentation score for empty zones. Link: https://lkml.kernel.org/r/100331ad9d274a9725e687b00d85d75d7e4a17c7.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 0fd6c81a7809..b758b00a4885 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2025,6 +2025,8 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) struct zone *zone; zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; score += fragmentation_score_zone_weighted(zone); } -- cgit v1.2.3 From 7d4a8be0c4b2b7ffb367929d2b352651f083806b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 10 Jan 2023 13:57:22 +1100 Subject: mm/mmu_notifier: remove unused mmu_notifier_range_update_to_read_only export mmu_notifier_range_update_to_read_only() was originally introduced in commit c6d23413f81b ("mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper") as an optimisation for device drivers that know a range has only been mapped read-only. However there are no users of this feature so remove it. As it is the only user of the struct mmu_notifier_range.vma field remove that also. Link: https://lkml.kernel.org/r/20230110025722.600912-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Mike Rapoport (IBM) Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Mike Kravetz Cc: Ira Weiny Cc: Jerome Glisse Cc: John Hubbard Cc: Ralph Campbell Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/mmu_notifier.h | 13 +++++-------- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 12 ++++++------ mm/khugepaged.c | 6 +++--- mm/ksm.c | 5 ++--- mm/madvise.c | 2 +- mm/mapping_dirty_helpers.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 4 ++-- mm/mmu_notifier.c | 10 ---------- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/oom_kill.c | 2 +- mm/rmap.c | 11 +++++------ 16 files changed, 37 insertions(+), 52 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index af1c49ae11b1..a44339a77a75 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1306,7 +1306,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e140277..64a3e051c3c4 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { - struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; @@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, - struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { - range->vma = vma; range->event = event; range->mm = mm; range->start = start; @@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, - struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end, void *owner) + struct mm_struct *mm, unsigned long start, + unsigned long end, void *owner) { - mmu_notifier_range_init(range, event, flags, vma, mm, start, end); + mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } @@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ +#define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) -#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ +#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c..29f36d2ae129 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7e68a36b4f7d..c13b1f67d14e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2020,7 +2020,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2282,7 +2282,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fe65f14d33b..273a6522aa4c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4966,7 +4966,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -5177,7 +5177,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mmu_notifier_range range; bool shared_pmd = false; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* @@ -5391,7 +5391,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); @@ -5597,7 +5597,7 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -6637,7 +6637,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, - 0, vma, mm, start, end); + 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -7368,7 +7368,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90acfea40c13..57164c15e076 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1040,8 +1040,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, anon_vma_lock_write(vma->anon_vma); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1412,7 +1412,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v if (vma->anon_vma) lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); diff --git a/mm/ksm.c b/mm/ksm.c index dd02780c387f..cea0c4478220 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1057,8 +1057,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1164,7 +1163,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/madvise.c b/mm/madvise.c index e407d335e614..5296e78dccda 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -765,7 +765,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 175e424b9ab1..e1eb33f49059 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -191,7 +191,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end, wpwalk->tlbflush_end = start; mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, - walk->vma, walk->mm, start, end); + walk->mm, start, end); mmu_notifier_invalidate_range_start(&wpwalk->range); flush_cache_range(walk->vma, start, end); diff --git a/mm/memory.c b/mm/memory.c index 90f8f72777c7..c6bacd58d032 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1266,7 +1266,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, src_vma, src_mm, addr, end); + 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as @@ -1683,7 +1683,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, }; MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { @@ -1709,7 +1709,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); if (is_vm_hugetlb_page(vma)) adjust_range_if_pmd_sharing_possible(vma, &range.start, @@ -3091,7 +3091,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -3561,7 +3561,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 721b2365dbca..6c3740318a98 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -306,7 +306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * private page mappings that won't be migrated. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->vma->vm_mm, migrate->start, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); @@ -733,7 +733,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, notified = true; mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, + MMU_NOTIFY_MIGRATE, 0, migrate->vma->vm_mm, addr, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f45ff1b7626a..50c0dde1354f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1120,13 +1120,3 @@ void mmu_notifier_synchronize(void) synchronize_srcu(&srcu); } EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - -bool -mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) -{ - if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) - return false; - /* Return true if the vma still have the read flag set. */ - return range->vma->vm_flags & VM_READ; -} -EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92fc6f3fa512..6ecdf0671b81 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -398,7 +398,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - vma, vma->vm_mm, addr, end); + vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index 930f65c315c0..05f90f47e149 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -498,7 +498,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_addr, len); flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..044e1eed720e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -542,7 +542,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, mm, vma->vm_start, + mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index ab74e0547a52..6ccd42bbae93 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -944,9 +944,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, vma->vm_mm, address, - vma_address_end(pvmw)); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { @@ -1475,7 +1474,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -1850,7 +1849,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -2180,7 +2179,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, swp_entry_t entry; pte_t swp_pte; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); -- cgit v1.2.3 From 94688e8eb453e616098cb930e5f6fed4a6ea2dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:47 +0000 Subject: mm: remove folio_pincount_ptr() and head_compound_pincount() We can use folio->_pincount directly, since all users are guarded by tests of compound/large. Link: https://lkml.kernel.org/r/20230111142915.1001531-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 29 ++++++++++++++--------------- include/linux/mm.h | 14 ++------------ include/linux/mm_types.h | 5 ----- mm/debug.c | 4 ++-- mm/gup.c | 8 ++++---- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 9 ++++++--- 8 files changed, 32 insertions(+), 45 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index facafbdecb95..9fb0b1080d3b 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, -an exact form of pin counting is achieved, by using the 2nd struct page -in the compound page. A new struct page field, compound_pincount, has -been added in order to support this. - -This approach for compound pages avoids the counting upper limit problems that -are discussed below. Those limitations would have been aggravated severely by -huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate compound_pincount field, -page overflows were seen in some huge page stress tests. - -This also means that huge pages and compound pages do not suffer +For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +the extra space available in the struct folio is used to store the +pincount directly. + +This approach for large folios avoids the counting upper limit problems +that are discussed below. Those limitations would have been aggravated +severely by huge pages, because each tail page adds a refcount to the +head page. And in fact, testing revealed that, without a separate pincount +field, refcount overflows were seen in some huge page stress tests. + +This also means that huge pages and large folios do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +263,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting -fields, and to better report on compound pages in general. Specifically, -for compound pages, the exact (compound_pincount) pincount is reported. +dump_page() has been enhanced slightly to handle these new counting +fields, and to better report on large folios in general. Specifically, +for large folios, the exact pincount is reported. References ========== diff --git a/include/linux/mm.h b/include/linux/mm.h index 76c97cb8ee9a..6d3945207067 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,6 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; @@ -1641,11 +1636,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1663,7 +1653,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1674,7 +1664,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10b6eb311ede..6ff1d7db00a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -443,11 +443,6 @@ static inline atomic_t *subpages_mapcount_ptr(struct page *page) return &page[1].subpages_mapcount; } -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/debug.c b/mm/debug.c index 7f8e5f744e42..893c9dbf76ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,11 +94,11 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), head_subpages_mapcount(head), - head_compound_pincount(head)); + atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG diff --git a/mm/gup.c b/mm/gup.c index f45a3a5be53a..38ba1697dd61 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -111,7 +111,7 @@ retry: * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its compound_pincount will be incremented by @refs. + * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. @@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * try_get_folio() is left intact. */ if (folio_test_large(folio)) - atomic_add(refs, folio_pincount_ptr(folio)); + atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -182,7 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (flags & FOLL_PIN) { node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) - atomic_sub(refs, folio_pincount_ptr(folio)); + atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } @@ -232,7 +232,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); - atomic_add(1, folio_pincount_ptr(folio)); + atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13b1f67d14e..9570f03cdee4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2477,9 +2477,9 @@ static void __split_huge_page_tail(struct page *head, int tail, * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. * - * What of 32-bit systems, on which head[1].compound_pincount overlays + * What of 32-bit systems, on which folio->_pincount overlays * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + * pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 273a6522aa4c..15b2707c1600 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,7 +1476,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, atomic_set(folio_mapcount_ptr(folio), 0); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1998,7 +1998,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, } atomic_set(folio_mapcount_ptr(folio), -1); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); return true; out_error: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5514d84cc712..b224c2132ed1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -775,11 +775,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -1291,6 +1293,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1314,8 +1317,8 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero subpages_mapcount"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; -- cgit v1.2.3 From eec20426d48bd7b63c69969a793943ed1a99b731 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:48 +0000 Subject: mm: convert head_subpages_mapcount() into folio_nr_pages_mapped() Calling this 'mapcount' is confusing since mapcount is usually the number of times something is mapped; instead this is the number of mapped pages. It's also better to enforce that this is a folio rather than a head page. Move folio_nr_pages_mapped() into mm/internal.h since this is not something we want device drivers or filesystems poking at. Get rid of folio_subpages_mapcount_ptr() and use folio->_nr_pages_mapped directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++-------------------- include/linux/mm_types.h | 12 +++--------- mm/debug.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/internal.h | 18 ++++++++++++++++++ mm/rmap.c | 9 +++++---- 6 files changed, 32 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d3945207067..2bdd08a5b8b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -843,24 +843,6 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED 0x800000 -#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ - return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -920,9 +902,9 @@ static inline bool folio_large_is_mapped(struct folio *folio) { /* * Reading folio_mapcount_ptr() below could be omitted if hugetlb - * participated in incrementing subpages_mapcount when compound mapped. + * participated in incrementing nr_pages_mapped when compound mapped. */ - return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + return atomic_read(&folio->_nr_pages_mapped) > 0 || atomic_read(folio_mapcount_ptr(folio)) >= 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6ff1d7db00a7..4751c67b98a6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -307,7 +307,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_subpages_mapcount: Do not use directly, call folio_mapcount(). + * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. @@ -361,7 +361,7 @@ struct folio { unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _compound_mapcount; - atomic_t _subpages_mapcount; + atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -404,7 +404,7 @@ FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _compound_mapcount); -FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); +FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -427,12 +427,6 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } -static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->subpages_mapcount; -} - static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; diff --git a/mm/debug.c b/mm/debug.c index 893c9dbf76ca..8e58e8dab0b2 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,10 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), - head_subpages_mapcount(head), + folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 15b2707c1600..c9702224931c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1475,7 +1475,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(folio_mapcount_ptr(folio), 0); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -1997,7 +1997,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, set_compound_head(p, &folio->page); } atomic_set(folio_mapcount_ptr(folio), -1); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; diff --git a/mm/internal.h b/mm/internal.h index 1d6f4e168510..583e15357e09 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. + */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 6ccd42bbae93..b573472b4ac3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,12 +1080,13 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, int total_compound_mapcount(struct page *head) { + struct folio *folio = (struct folio *)head; int mapcount = head_compound_mapcount(head); int nr_subpages; int i; /* In the common case, avoid the loop when no subpages mapped by PTE */ - if (head_subpages_mapcount(head) == 0) + if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* * Add all the PTE mappings of those subpages mapped by PTE. @@ -1233,7 +1234,7 @@ void page_add_anon_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1337,7 +1338,7 @@ void page_add_file_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1399,7 +1400,7 @@ void page_remove_rmap(struct page *page, nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; -- cgit v1.2.3 From 6eee1a0062298601dfc36bd34517affc4458c43d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:49 +0000 Subject: doc: clarify refcount section by referring to folios & pages Include the rename of subpages_mapcount to _nr_pages_mapped. Link: https://lkml.kernel.org/r/20230111142915.1001531-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index ec3dc5b04226..03bbd0a19041 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -112,20 +112,20 @@ Refcounts and transparent huge pages Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate on head page's ->_refcount. + - get_page()/put_page() and GUP operate on the folio->_refcount. - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeeds on tail pages. - - map/unmap of PMD entry for the whole compound page increment/decrement - ->compound_mapcount, stored in the first tail page of the compound page; - and also increment/decrement ->subpages_mapcount (also in the first tail) - by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1. + - map/unmap of a PMD entry for the whole THP increment/decrement + folio->_entire_mapcount and also increment/decrement + folio->_nr_pages_mapped by COMPOUND_MAPPED when _entire_mapcount + goes from -1 to 0 or 0 to -1. - - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page, and also increment/decrement - ->subpages_mapcount, stored in first tail page of the compound page, when - _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. + - map/unmap of individual pages with PTE entry increment/decrement + page->_mapcount and also increment/decrement folio->_nr_pages_mapped + when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts + the number of pages mapped by PTE. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page -- cgit v1.2.3 From b14224fbea62e5bffd680613376fe1268f4103ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:50 +0000 Subject: mm: convert total_compound_mapcount() to folio_total_mapcount() Instead of enforcing that the argument must be a head page by naming, enforce it with the compiler by making it a folio. Also rename the counter in struct folio from _compound_mapcount to _entire_mapcount. Link: https://lkml.kernel.org/r/20230111142915.1001531-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 6 +++--- mm/rmap.c | 21 ++++++++++----------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2bdd08a5b8b4..bdf83e75bcd6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -871,7 +871,7 @@ static inline int page_mapcount(struct page *page) return head_compound_mapcount(page) + mapcount; } -int total_compound_mapcount(struct page *head); +int folio_total_mapcount(struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -888,14 +888,14 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return total_compound_mapcount(&folio->page); + return folio_total_mapcount(folio); } static inline int total_mapcount(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - return total_compound_mapcount(compound_head(page)); + return folio_total_mapcount(page_folio(page)); } static inline bool folio_large_is_mapped(struct folio *folio) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4751c67b98a6..70cbda768308 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -306,7 +306,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). @@ -360,7 +360,7 @@ struct folio { unsigned long _head_1; unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _compound_mapcount; + atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT @@ -403,7 +403,7 @@ FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(compound_mapcount, _entire_mapcount); FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT diff --git a/mm/rmap.c b/mm/rmap.c index b573472b4ac3..1418efc80027 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1078,27 +1078,26 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -int total_compound_mapcount(struct page *head) +int folio_total_mapcount(struct folio *folio) { - struct folio *folio = (struct folio *)head; - int mapcount = head_compound_mapcount(head); - int nr_subpages; + int mapcount = folio_entire_mapcount(folio); + int nr_pages; int i; - /* In the common case, avoid the loop when no subpages mapped by PTE */ + /* In the common case, avoid the loop when no pages mapped by PTE */ if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* - * Add all the PTE mappings of those subpages mapped by PTE. - * Limit the loop, knowing that only subpages_mapcount are mapped? + * Add all the PTE mappings of those pages mapped by PTE. + * Limit the loop to folio_nr_pages_mapped()? * Perhaps: given all the raciness, that may be a good or a bad idea. */ - nr_subpages = thp_nr_pages(head); - for (i = 0; i < nr_subpages; i++) - mapcount += atomic_read(&head[i]._mapcount); + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + mapcount += atomic_read(&folio_page(folio, i)->_mapcount); /* But each of those _mapcounts was based on -1 */ - mapcount += nr_subpages; + mapcount += nr_pages; return mapcount; } -- cgit v1.2.3 From 62beb906ef644b0f0555b2b9f9626c27e2038d84 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:51 +0000 Subject: mm: convert page_remove_rmap() to use a folio internally The API for page_remove_rmap() needs to be page-based, because we can remove mappings of pages individually. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1418efc80027..7762ab8371c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1365,19 +1365,21 @@ void page_add_file_rmap(struct page *page, * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_remove_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool last; + enum node_stat_item idx; VM_BUG_ON_PAGE(compound && !PageHead(page), page); /* Hugetlb pages are not counted in NR_*MAPPED */ - if (unlikely(PageHuge(page))) { + if (unlikely(folio_test_hugetlb(folio))) { /* hugetlb pages are always mapped with pmds */ - atomic_dec(compound_mapcount_ptr(page)); + atomic_dec(&folio->_entire_mapcount); return; } @@ -1385,20 +1387,18 @@ void page_remove_rmap(struct page *page, if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; - if (last && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (last && folio_test_large(folio)) { nr = atomic_dec_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - last = atomic_add_negative(-1, compound_mapcount_ptr(page)); + last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { - mapped = subpages_mapcount_ptr(page); nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) @@ -1411,21 +1411,26 @@ void page_remove_rmap(struct page *page, } if (nr_pmdmapped) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : - (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : - NR_FILE_PMDMAPPED), -nr_pmdmapped); + if (folio_test_anon(folio)) + idx = NR_ANON_THPS; + else if (folio_test_swapbacked(folio)) + idx = NR_SHMEM_PMDMAPPED; + else + idx = NR_FILE_PMDMAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); } if (nr) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : - NR_FILE_MAPPED, -nr); + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr); + /* - * Queue anon THP for deferred split if at least one small - * page of the compound page is unmapped, but at least one - * small page is still mapped. + * Queue anon THP for deferred split if at least one + * page of the folio is unmapped and at least one page + * is still mapped. */ - if (PageTransCompound(page) && PageAnon(page)) + if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) - deferred_split_huge_page(compound_head(page)); + deferred_split_huge_page(&folio->page); } /* -- cgit v1.2.3 From ee0800c2f6a9e605947ce499d79fb7e2be16d6dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:52 +0000 Subject: mm: convert page_add_anon_rmap() to use a folio internally The API for page_add_anon_rmap() needs to be page-based, because we can add mappings of individual pages. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7762ab8371c0..7d5e0bf98c1e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1207,10 +1207,11 @@ static void __page_check_anon_rmap(struct page *page, * and to ensure that PageAnon is not being upgraded racily to PageKsm * (but PageKsm is never downgraded to PageAnon). */ -void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, rmap_t flags) +void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first = true; @@ -1219,20 +1220,18 @@ void page_add_anon_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1248,11 +1247,11 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (nr_pmdmapped) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - if (likely(!PageKsm(page))) { + if (likely(!folio_test_ksm(folio))) { /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, -- cgit v1.2.3 From eb01a2ad7e9cba1b9dd131edc5a26ffbda90a5ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:53 +0000 Subject: mm: convert page_add_file_rmap() to use a folio internally The API for page_add_file_rmap() needs to be page-based, because we can add mappings of individual pages. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7d5e0bf98c1e..430c066c2295 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1309,10 +1309,11 @@ void page_add_new_anon_rmap(struct page *page, * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool first; @@ -1322,20 +1323,18 @@ void page_add_file_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1348,10 +1347,10 @@ void page_add_file_rmap(struct page *page, } if (nr_pmdmapped) - __mod_lruvec_page_state(page, PageSwapBacked(page) ? + __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); mlock_vma_page(page, vma, compound); } -- cgit v1.2.3 From 4d510f3da4c216d4c2695395f67aec38e2aa6cc7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:54 +0000 Subject: mm: add folio_add_new_anon_rmap() In contrast to other rmap functions, page_add_new_anon_rmap() is always called with a freshly allocated page. That means it can't be called with a tail page. Turn page_add_new_anon_rmap() into folio_add_new_anon_rmap() and add a page_add_new_anon_rmap() wrapper. Callers can be converted individually. [akpm@linux-foundation.org: fix NOMMU build. page_add_new_anon_rmap() requires CONFIG_MMU] [willy@infradead.org: folio-compat.c needs rmap.h] Link: https://lkml.kernel.org/r/20230111142915.1001531-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 ++ mm/folio-compat.c | 11 +++++++++++ mm/rmap.c | 37 ++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..aa682a2a93ce 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -194,6 +194,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); +void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 69ed25790c68..18c48b557926 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "internal.h" @@ -123,3 +124,13 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_MMU +void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + + return folio_add_new_anon_rmap((struct folio *)page, vma, address); +} +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 430c066c2295..513e23e5a158 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1264,41 +1264,40 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } /** - * page_add_new_anon_rmap - add mapping to a new anonymous page - * @page: the page to add the mapping to + * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. + * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * If it's a compound page, it is accounted as a compound page. As the page - * is new, it's assume to get mapped exclusively by a single process. - * - * Same as page_add_anon_rmap but must only be called on *new* pages. + * Like page_add_anon_rmap() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * Page does not have to be locked. + * The folio does not have to be locked. + * + * If the folio is large, it is accounted as a THP. As the folio + * is new, it's assumed to be mapped exclusively by a single process. */ -void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, + unsigned long address) { int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - __SetPageSwapBacked(page); + __folio_set_swapbacked(folio); - if (likely(!PageCompound(page))) { + if (likely(!folio_test_pmd_mappable(folio))) { /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); + atomic_set(&folio->_mapcount, 0); nr = 1; } else { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); - nr = thp_nr_pages(page); - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); + nr = folio_nr_pages(folio); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); } - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(page, vma, address, 1); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); + __page_set_anon_rmap(&folio->page, vma, address, 1); } /** -- cgit v1.2.3 From 65a689f35ad7ebbfb79f429c1bc290b042ebb10b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:55 +0000 Subject: page_alloc: use folio fields directly Rmove the uses of compound_mapcount_ptr(), head_compound_mapcount() and subpages_mapcount_ptr() Link: https://lkml.kernel.org/r/20230111142915.1001531-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b224c2132ed1..f15e0e15243f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -779,8 +779,8 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); } @@ -1309,12 +1309,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(head_compound_mapcount(head_page))) { - bad_page(page, "nonzero compound_mapcount"); + if (unlikely(folio_entire_mapcount(folio))) { + bad_page(page, "nonzero entire_mapcount"); goto out; } - if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { - bad_page(page, "nonzero subpages_mapcount"); + if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + bad_page(page, "nonzero nr_pages_mapped"); goto out; } if (unlikely(atomic_read(&folio->_pincount))) { -- cgit v1.2.3 From db4e5dbdcdd55482ab23bf4a0ae6746f93efb0d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:56 +0000 Subject: mm: use a folio in hugepage_add_anon_rmap() and hugepage_add_new_anon_rmap() Remove uses of compound_mapcount_ptr() Link: https://lkml.kernel.org/r/20230111142915.1001531-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 513e23e5a158..0020474f46c1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2528,13 +2528,14 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma = vma->anon_vma; int first; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) @@ -2545,10 +2546,12 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct folio *folio = page_folio(page); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - ClearHPageRestoreReserve(page); + atomic_set(&folio->_entire_mapcount, 0); + folio_clear_hugetlb_restore_reserve(folio); __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From c7f84b5723f1a60becd79d895ab214a7d5ee93c1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:57 +0000 Subject: mm: use entire_mapcount in __page_dup_rmap() Remove the use of the compound_mapcount_ptr() wrapper, and add an assertion that we're not passing a tail page if we're duplicating a PMD. Link: https://lkml.kernel.org/r/20230111142915.1001531-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index aa682a2a93ce..a6bd1f0a183d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -208,7 +208,14 @@ void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, static inline void __page_dup_rmap(struct page *page, bool compound) { - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); + if (compound) { + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + atomic_inc(&folio->_entire_mapcount); + } else { + atomic_inc(&page->_mapcount); + } } static inline void page_dup_file_rmap(struct page *page, bool compound) -- cgit v1.2.3 From 91ec7f284a0c445b251caba942c993ebf7b6db9f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:58 +0000 Subject: mm/debug: remove call to head_compound_mapcount() Call folio_entire_mapcount() instead. Link: https://lkml.kernel.org/r/20230111142915.1001531-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 8e58e8dab0b2..9d3d893dc7f4 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,9 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), - head_compound_mapcount(head), + folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } -- cgit v1.2.3 From 46f2722825983a51e849eb0ef2814e5c7f040fef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:59 +0000 Subject: hugetlb: remove uses of folio_mapcount_ptr Use the entire_mapcount field directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9702224931c..a68e0e597a8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1474,7 +1474,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, int nr_pages = 1 << order; struct page *p; - atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -1996,7 +1996,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, if (i != 0) set_compound_head(p, &folio->page); } - atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; -- cgit v1.2.3 From c97eeb8f260dba098ba775e37d216f81f28559a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:00 +0000 Subject: mm: convert page_mapcount() to use folio_entire_mapcount() Remove a use of head_compound_mapcount(). Link: https://lkml.kernel.org/r/20230111142915.1001531-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bdf83e75bcd6..a6afa6c51a4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -853,22 +853,26 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount of compound_head of page. +/** + * page_mapcount() - Number of times this precise page is mapped. + * @page: The page. + * + * The number of times this page is mapped. If this page is part of + * a large folio, it includes the number of times this page is mapped + * as part of that folio. * - * Result is undefined for pages which cannot be mapped into userspace. + * The result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * They use this field in struct page differently. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; - if (likely(!PageCompound(page))) - return mapcount; - page = compound_head(page); - return head_compound_mapcount(page) + mapcount; + if (unlikely(PageCompound(page))) + mapcount += folio_entire_mapcount(page_folio(page)); + + return mapcount; } int folio_total_mapcount(struct folio *folio); -- cgit v1.2.3 From 1aa4d03b60c0f61a8d96d5d633bf7968dbf6841f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:01 +0000 Subject: mm: remove head_compound_mapcount() and _ptr functions folio_mapcount_ptr(), compound_mapcount_ptr() and subpages_mapcount_ptr() are all now unused. Link: https://lkml.kernel.org/r/20230111142915.1001531-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++------------ include/linux/mm_types.h | 16 ---------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6afa6c51a4d..7ff6e2410aa3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -831,16 +831,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - return atomic_read(folio_mapcount_ptr(folio)) + 1; -} - -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. - * Must be called only on head of compound page. - */ -static inline int head_compound_mapcount(struct page *head) -{ - return atomic_read(compound_mapcount_ptr(head)) + 1; + return atomic_read(&folio->_entire_mapcount) + 1; } /* @@ -905,11 +896,11 @@ static inline int total_mapcount(struct page *page) static inline bool folio_large_is_mapped(struct folio *folio) { /* - * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * Reading _entire_mapcount below could be omitted if hugetlb * participated in incrementing nr_pages_mapped when compound mapped. */ return atomic_read(&folio->_nr_pages_mapped) > 0 || - atomic_read(folio_mapcount_ptr(folio)) >= 0; + atomic_read(&folio->_entire_mapcount) >= 0; } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 70cbda768308..ffcf21fbaaf0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -421,22 +421,6 @@ FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH -static inline atomic_t *folio_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->compound_mapcount; -} - -static inline atomic_t *compound_mapcount_ptr(struct page *page) -{ - return &page[1].compound_mapcount; -} - -static inline atomic_t *subpages_mapcount_ptr(struct page *page) -{ - return &page[1].subpages_mapcount; -} - /* * Used for sizing the vmemmap region on some architectures */ -- cgit v1.2.3 From 5eb5cea11dcbafaa37685bc4e89e1d4ae9c434ea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:02 +0000 Subject: mm: reimplement compound_order() Make compound_order() use struct folio. It can't be turned into a wrapper around folio_order() as a page can be turned into a tail page between a check in compound_order() and the assertion in folio_test_large(). Link: https://lkml.kernel.org/r/20230111142915.1001531-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ff6e2410aa3..3adc37cebe6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,11 +719,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before _folio_order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ static inline unsigned int compound_order(struct page *page) { - if (!PageHead(page)) + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) return 0; - return page[1].compound_order; + return folio->_folio_order; } /** -- cgit v1.2.3 From 21a000fe97a018c6d25be63892afb4fd8210ab57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:03 +0000 Subject: mm: reimplement compound_nr() Turn compound_nr() into a wrapper around folio_nr_pages(). Similarly to compound_order(), casting the struct page directly to struct folio preserves the existing behaviour, while calling page_folio() would change the behaviour. Move thp_nr_pages() down in the file so that compound_nr() can be after folio_nr_pages(). [willy@infradead.org: fix assertion triggering] Link: https://lkml.kernel.org/r/Y8AFgZEEjnUIaCbf@casper.infradead.org Link: https://lkml.kernel.org/r/20230111142915.1001531-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3adc37cebe6f..3acf09d5b0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1005,18 +1005,6 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ - if (!PageHead(page)) - return 1; -#ifdef CONFIG_64BIT - return page[1].compound_nr; -#else - return 1UL << compound_order(page); -#endif -} - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { @@ -1039,16 +1027,6 @@ static inline unsigned int thp_order(struct page *page) return compound_order(page); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return compound_nr(page); -} - /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. @@ -1758,6 +1736,33 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline unsigned long compound_nr(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + return folio_nr_pages((struct folio *)page); +} + /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. -- cgit v1.2.3 From bad6da64565846ef5ba85b0b685cfde9db0085dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:04 +0000 Subject: mm: convert set_compound_page_dtor() and set_compound_order() to folios Replace uses of compound_dtor, compound_order and compound_nr by their folio equivalents. Link: https://lkml.kernel.org/r/20230111142915.1001531-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3acf09d5b0bd..836b96e08a14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -984,8 +984,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) { + struct folio *folio = (struct folio *)page; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - page[1].compound_dtor = compound_dtor; + VM_BUG_ON_PAGE(!PageHead(page), page); + folio->_folio_dtor = compound_dtor; } static inline void folio_set_compound_dtor(struct folio *folio, @@ -999,9 +1002,11 @@ void destroy_large_folio(struct folio *folio); static inline void set_compound_order(struct page *page, unsigned int order) { - page[1].compound_order = order; + struct folio *folio = (struct folio *)page; + + folio->_folio_order = order; #ifdef CONFIG_64BIT - page[1].compound_nr = 1U << order; + folio->_folio_nr_pages = 1U << order; #endif } -- cgit v1.2.3 From f04029f34e8c3750b8fb39b54e788b9355d1b912 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:05 +0000 Subject: mm: convert is_transparent_hugepage() to use a folio Replace a use of page->compound_dtor with its folio equivalent. Link: https://lkml.kernel.org/r/20230111142915.1001531-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9570f03cdee4..bfa960f012fa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -591,12 +591,14 @@ void prep_transhuge_page(struct page *page) static inline bool is_transparent_hugepage(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return false; - page = compound_head(page); - return is_huge_zero_page(page) || - page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; + folio = page_folio(page); + return is_huge_zero_page(&folio->page) || + folio->_folio_dtor == TRANSHUGE_PAGE_DTOR; } static unsigned long __thp_get_unmapped_area(struct file *filp, -- cgit v1.2.3 From a60d5942cc9bd65806b69360020d9b1664c747ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:06 +0000 Subject: mm: convert destroy_large_folio() to use folio_dtor Replace a use of compound_dtor. Link: https://lkml.kernel.org/r/20230111142915.1001531-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f15e0e15243f..88494e82843d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,7 +807,7 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + enum compound_dtor_id dtor = folio->_folio_dtor; VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); -- cgit v1.2.3 From 2d678c641a4625d2b1cfeb50d7426fab6d3740b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:07 +0000 Subject: hugetlb: remove uses of compound_dtor and compound_nr Convert the entire file to use the folio equivalents. Link: https://lkml.kernel.org/r/20230111142915.1001531-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a68e0e597a8f..ca9e177b9c54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2038,11 +2038,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, */ int PageHuge(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return 0; - - page = compound_head(page); - return page[1].compound_dtor == HUGETLB_PAGE_DTOR; + folio = page_folio(page); + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -2052,10 +2053,11 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - if (!PageHead(page_head)) + struct folio *folio = (struct folio *)page_head; + if (!folio_test_large(folio)) return 0; - return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHeadHuge); -- cgit v1.2.3 From 1c5509be58f636afabbdaf66e7436da8ec0a1828 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:08 +0000 Subject: mm: remove 'First tail page' members from struct page All former users now use the folio equivalents, so remove them from the definition of struct page. Link: https://lkml.kernel.org/r/20230111142915.1001531-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 18 ------------------ kernel/crash_core.c | 4 ++-- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ffcf21fbaaf0..94b1707f5d33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -140,16 +140,6 @@ struct page { }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - atomic_t compound_mapcount; - atomic_t subpages_mapcount; - atomic_t compound_pincount; -#ifdef CONFIG_64BIT - unsigned int compound_nr; /* 1 << compound_order */ -#endif }; struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ @@ -401,14 +391,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -FOLIO_MATCH(compound_dtor, _folio_dtor); -FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _entire_mapcount); -FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); -FOLIO_MATCH(compound_pincount, _pincount); -#ifdef CONFIG_64BIT -FOLIO_MATCH(compound_nr, _folio_nr_pages); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 87ef6096823f..755f5f08ab38 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(folio, _folio_dtor); + VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); -- cgit v1.2.3 From a8d55327ccc1f999a5fba4eee67ed08bd36493ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:09 +0000 Subject: doc: correct struct folio kernel-doc Insert appropriate public: and private: markers to make the generated kernel-doc look right. Link: https://lkml.kernel.org/r/20230111142915.1001531-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 94b1707f5d33..d458e9b8496c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -292,16 +292,12 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_flags_1: For large folios, additional page flags. - * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_flags_2: For alignment. Do not use. - * @_head_2: Points to the folio. Do not use. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -348,6 +344,7 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + /* public: */ unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _entire_mapcount; @@ -356,6 +353,7 @@ struct folio { #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -363,10 +361,12 @@ struct folio { struct { unsigned long _flags_2; unsigned long _head_2; + /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ }; struct page __page_2; }; -- cgit v1.2.3 From 4375a553f46c6cb66d1711d8f514dfdf34ce74b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:10 +0000 Subject: mm: move page->deferred_list to folio->_deferred_list Remove the entire block of definitions for the second tail page, and add the deferred list to the struct folio. This actually moves _deferred_list to a different offset in struct folio because I don't see a need to include the padding. This lets us use list_for_each_entry_safe() in deferred_split_scan() and avoid a number of calls to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 ++++----- include/linux/mm_types.h | 14 ++++++++------ mm/huge_memory.c | 32 +++++++++++++++----------------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1341fdcf666..aacfcb02606f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -295,11 +295,10 @@ static inline bool thp_migration_supported(void) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * See organization of tail pages of compound page in - * "struct page" definition. - */ - return &page[2].deferred_list; + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + return &folio->_deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d458e9b8496c..7eb4d0815a78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,12 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Second tail page of transparent huge page */ - unsigned long _compound_pad_1; /* compound_head */ - unsigned long _compound_pad_2; - /* For both global and memcg */ - struct list_head deferred_list; - }; struct { /* Second tail page of hugetlb page */ unsigned long _hugetlb_pad_1; /* compound_head */ void *hugetlb_subpool; @@ -302,6 +296,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). + * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -366,6 +361,13 @@ struct folio { void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ + }; + struct { + unsigned long _flags_2a; + unsigned long _head_2a; + /* public: */ + struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; struct page __page_2; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfa960f012fa..a4138daaa0b8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2756,9 +2756,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(&folio->page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(&folio->page)); + list_del(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { @@ -2873,8 +2873,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; - LIST_HEAD(list), *pos, *next; - struct page *page; + LIST_HEAD(list); + struct folio *folio, *next; int split = 0; #ifdef CONFIG_MEMCG @@ -2884,14 +2884,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &ds_queue->split_queue) { - page = list_entry((void *)pos, struct page, deferred_list); - page = compound_head(page); - if (get_page_unless_zero(page)) { - list_move(page_deferred_list(page), &list); + list_for_each_entry_safe(folio, next, &ds_queue->split_queue, + _deferred_list) { + if (folio_try_get(folio)) { + list_move(&folio->_deferred_list, &list); } else { - /* We lost race with put_compound_page() */ - list_del_init(page_deferred_list(page)); + /* We lost race with folio_put() */ + list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) @@ -2899,16 +2898,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - list_for_each_safe(pos, next, &list) { - page = list_entry((void *)pos, struct page, deferred_list); - if (!trylock_page(page)) + list_for_each_entry_safe(folio, next, &list, _deferred_list) { + if (!folio_trylock(folio)) goto next; /* split_huge_page() removes page from list on success */ - if (!split_huge_page(page)) + if (!split_folio(folio)) split++; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); -- cgit v1.2.3 From 8991de90e99755b13026b1db32d1fa52e94c6a96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:11 +0000 Subject: mm/huge_memory: remove page_deferred_list() Use folio->_deferred_list directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 -------- mm/huge_memory.c | 34 +++++++++++++++++----------------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index aacfcb02606f..b9978978a160 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -293,14 +293,6 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } -static inline struct list_head *page_deferred_list(struct page *page) -{ - struct folio *folio = (struct folio *)page; - - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - return &folio->_deferred_list; -} - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a4138daaa0b8..7aedfe7cf5df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -580,12 +580,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) void prep_transhuge_page(struct page *page) { - /* - * we use page->mapping and page->index in second tail page - * as list_head: assuming THP order >= 2 - */ + struct folio *folio = (struct folio *)page; - INIT_LIST_HEAD(page_deferred_list(page)); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + INIT_LIST_HEAD(&folio->_deferred_list); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } @@ -2802,13 +2800,14 @@ out: void free_transhuge_page(struct page *page) { + struct folio *folio = (struct folio *)page; struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(page_deferred_list(page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(page)); + list_del(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); @@ -2816,38 +2815,39 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { + struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = page_memcg(compound_head(page)); + struct mem_cgroup *memcg = folio_memcg(folio); #endif unsigned long flags; - VM_BUG_ON_PAGE(!PageTransHuge(page), page); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same page, it is + * And, if page reclaim is already handling the same folio, it is * unnecessary to handle it again in shrinker. * - * Check PageSwapCache to determine if the page is being - * handled by page reclaim since THP swap would add the page into + * Check the swapcache flag to determine if the folio is being + * handled by page reclaim since THP swap would add the folio into * swap cache before calling try_to_unmap(). */ - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) return; - if (!list_empty(page_deferred_list(page))) + if (!list_empty(&folio->_deferred_list)) return; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (list_empty(page_deferred_list(page))) { + if (list_empty(&folio->_deferred_list)) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - set_shrinker_bit(memcg, page_to_nid(page), + set_shrinker_bit(memcg, folio_nid(folio), deferred_split_shrinker.id); #endif } -- cgit v1.2.3 From f8baa6be0368b5d21be34e8bf071b563b0f77584 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:12 +0000 Subject: mm/huge_memory: convert get_deferred_split_queue() to take a folio Removes a few calls to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7aedfe7cf5df..c23b0e01734b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -559,10 +559,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) } #ifdef CONFIG_MEMCG -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct mem_cgroup *memcg = page_memcg(compound_head(page)); - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); if (memcg) return &memcg->deferred_split_queue; @@ -570,9 +571,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) return &pgdat->deferred_split_queue; } #else -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); return &pgdat->deferred_split_queue; } @@ -2650,7 +2652,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -2801,7 +2803,7 @@ out: void free_transhuge_page(struct page *page) { struct folio *folio = (struct folio *)page; - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2816,7 +2818,7 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { struct folio *folio = page_folio(page); - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); #endif -- cgit v1.2.3 From f158ed6195ef949060811fd85086928470651944 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:13 +0000 Subject: mm: convert deferred_split_huge_page() to deferred_split_folio() Now that both callers use a folio, pass the folio in and save a call to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 6 +++--- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 3 +-- mm/rmap.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 03bbd0a19041..a9608fe51649 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -153,8 +153,8 @@ clear where references should go after split: it will stay on the head page. Note that split_huge_pmd() doesn't have any limitations on refcounting: pmd can be split at any point and never fails. -Partial unmap and deferred_split_huge_page() -============================================ +Partial unmap and deferred_split_folio() +======================================== Unmapping part of THP (with munmap() or other way) is not going to free memory immediately. Instead, we detect that a subpage of THP is not in use @@ -166,6 +166,6 @@ the place where we can detect partial unmap. It also might be counterproductive since in many cases partial unmap happens during exit(2) if a THP crosses a VMA boundary. -The function deferred_split_huge_page() is used to queue a page for splitting. +The function deferred_split_folio() is used to queue a folio for splitting. The splitting itself will happen when we get memory pressure via shrinker interface. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b9978978a160..70bd867eba94 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -187,7 +187,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } -void deferred_split_huge_page(struct page *page); +void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -340,7 +340,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_huge_page(struct page *page) {} +static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c23b0e01734b..868fcccdff72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2815,9 +2815,8 @@ void free_transhuge_page(struct page *page) free_compound_page(page); } -void deferred_split_huge_page(struct page *page) +void deferred_split_folio(struct folio *folio) { - struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 0020474f46c1..a079d9964b9c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1427,7 +1427,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, */ if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) - deferred_split_huge_page(&folio->page); + deferred_split_folio(folio); } /* -- cgit v1.2.3 From 6a171c16e62f854e6a7e0f837dbe8f3ace0f00ce Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 11 Jan 2023 14:29:14 +0000 Subject: mm: remove the hugetlb field from struct page Patch series "Get rid of tail page fields". Continue the shrinkage of the struct page definition by getting rid of the 'first tail page' and 'second tail page' fields. I originally did this patch set before Hugh's rewrite of the subpages_mapcount, so it needed substantial updates; hope I didn't miss anything. This patch (of 28): commit dad6a5eb5556(mm,hugetlb: use folio fields in second tail page) added a transitional hugetlb field to struct page and struct folio to make room for another int in the first tail of a compound page. Hugetlb folio conversions have changed all page users of this field to use the fields within the folio so struct page no longer needs this hugetlb specific field. Link: https://lkml.kernel.org/r/20230111142915.1001531-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230111142915.1001531-29-willy@infradead.org Signed-off-by: Sidhartha Kumar Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7eb4d0815a78..452920467223 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,14 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Second tail page of hugetlb page */ - unsigned long _hugetlb_pad_1; /* compound_head */ - void *hugetlb_subpool; - void *hugetlb_cgroup; - void *hugetlb_cgroup_rsvd; - void *hugetlb_hwpoison; - /* No more space on 32-bit: use third tail if more */ - }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -399,10 +391,6 @@ FOLIO_MATCH(compound_head, _head_1); offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); -FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); -FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); -FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH /* -- cgit v1.2.3 From f942b0f0528d1198b94b8211c84d4f28a654c0ff Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 11 Jan 2023 21:53:48 +0800 Subject: maple_tree: fix comment of mte_destroy_walk The parameter name of maple tree is mt, make the comment be mt instead of mn, and the separator between the parameter name and the description to be : instead of -. Link: https://lkml.kernel.org/r/20230111135348.803181-1-vernon2gm@gmail.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Vernon Yang Cc: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5be99550e36d..1c5d3b640a24 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5579,8 +5579,8 @@ free_leaf: /* * mte_destroy_walk() - Free a tree or sub-tree. - * @enode - the encoded maple node (maple_enode) to start - * @mn - the tree to free - needed for node types. + * @enode: the encoded maple node (maple_enode) to start + * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ -- cgit v1.2.3 From 82b249361f2d1627acc7dbbdab1c80246a02fd4a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 11 Jan 2023 21:20:36 +0800 Subject: mm/mmap: fix comment of unmapped_area{_topdown} The low_limit of unmapped area information is inclusive, and the hight_limit is not, so make symbol to be [ instead of (. And replace hight_limit to high_limit. Link: https://lkml.kernel.org/r/20230111132036.801404-1-vernon2gm@gmail.com Fixes: 3499a13168da ("mm/mmap: use maple tree for unmapped_area{_topdown}") Signed-off-by: Vernon Yang Cc: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/mmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 0641e6e0016c..335ba3df9898 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1558,8 +1558,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ @@ -1585,11 +1585,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) /** * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with * the correct alignment and offset at the highest available + * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ -- cgit v1.2.3 From 1e15d374bb1cb95f738334018a216e4f6360e821 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 11 Jan 2023 11:18:06 +0100 Subject: Revert "x86: kmsan: sync metadata pages on page fault" This reverts commit 3f1e2c7a9099c1ed32c67f12cdf432ba782cf51f. As noticed by Qun-Wei Lin, arch_sync_kernel_mappings() in arch/x86/mm/fault.c is only used with CONFIG_X86_32, whereas KMSAN is only supported on x86_64, where this code is not compiled. The patch in question dates back to downstream KMSAN branch based on v5.8-rc5, it sneaked into upstream unnoticed in v6.1. Link: https://lkml.kernel.org/r/20230111101806.3236991-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: Qun-Wei Lin Link: https://github.com/google/kmsan/issues/91 Cc: Andy Lutomirski Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8..a498ae1fbe66 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,27 +284,6 @@ static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) -{ - __arch_sync_kernel_mappings(start, end); -#ifdef CONFIG_KMSAN - /* - * KMSAN maintains two additional metadata page mappings for the - * [VMALLOC_START, VMALLOC_END) range. These mappings start at - * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and - * have to be synced together with the vmalloc memory mapping. - */ - if (start >= VMALLOC_START && end < VMALLOC_END) { - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, - end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, - end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); - } -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; -- cgit v1.2.3 From 4c110ec98e39944732ec31bf0415f22632bae2b7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:01 -0600 Subject: mm/memory-failure: convert __get_huge_page_for_hwpoison() to folios Patch series "convert hugepage memory failure functions to folios". This series contains a 1:1 straightforward page to folio conversion for memory failure functions which deal with huge pages. I renamed a few functions to fit with how other folio operating functions are named. These include: hugetlb_clear_page_hwpoison -> folio_clear_hugetlb_hwpoison free_raw_hwp_pages -> folio_free_raw_hwp __free_raw_hwp_pages -> __folio_free_raw_hwp hugetlb_set_page_hwpoison -> folio_set_hugetlb_hwpoison The goal of this series was to reduce users of the hugetlb specific page flag macros which take in a page so users are protected by the compiler to make sure they are operating on a head page. This patch (of 8): Use a folio throughout the function rather than using a head page. This also reduces the users of the page version of hugetlb specific page flags. Link: https://lkml.kernel.org/r/20230112204608.80136-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6bf07345ea2c..cca28f19ad99 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1807,20 +1807,20 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 2; /* fallback to normal page handling */ bool count_increased = false; - if (!PageHeadHuge(head)) + if (!folio_test_hugetlb(folio)) goto out; if (flags & MF_COUNT_INCREASED) { ret = 1; count_increased = true; - } else if (HPageFreed(head)) { + } else if (folio_test_hugetlb_freed(folio)) { ret = 0; - } else if (HPageMigratable(head)) { - ret = get_page_unless_zero(head); + } else if (folio_test_hugetlb_migratable(folio)) { + ret = folio_try_get(folio); if (ret) count_increased = true; } else { @@ -1829,24 +1829,24 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, goto out; } - if (hugetlb_set_page_hwpoison(head, page)) { + if (hugetlb_set_page_hwpoison(&folio->page, page)) { ret = -EHWPOISON; goto out; } /* - * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them * from being migrated by memory hotremove. */ - if (count_increased && HPageMigratable(head)) { - ClearHPageMigratable(head); + if (count_increased && folio_test_hugetlb_migratable(folio)) { + folio_clear_hugetlb_migratable(folio); *migratable_cleared = true; } return ret; out: if (count_increased) - put_page(head); + folio_put(folio); return ret; } -- cgit v1.2.3 From bc1cfde194675215857755b75e5fe90f6a654843 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:02 -0600 Subject: mm/memory-failure: convert try_memory_failure_hugetlb() to folios Use a struct folio rather than a head page in try_memory_failure_hugetlb. This converts one user of SetHPageMigratable to the folio equivalent. Link: https://lkml.kernel.org/r/20230112204608.80136-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cca28f19ad99..19f6035608d5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1860,7 +1860,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb { int res; struct page *p = pfn_to_page(pfn); - struct page *head; + struct folio *folio; unsigned long page_flags; bool migratable_cleared = false; @@ -1873,8 +1873,8 @@ retry: } else if (res == -EHWPOISON) { pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { - head = compound_head(p); - res = kill_accessing_process(current, page_to_pfn(head), flags); + folio = page_folio(p); + res = kill_accessing_process(current, folio_pfn(folio), flags); } return res; } else if (res == -EBUSY) { @@ -1885,16 +1885,16 @@ retry: return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } - head = compound_head(p); - lock_page(head); + folio = page_folio(p); + folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(head); + hugetlb_clear_page_hwpoison(&folio->page); if (migratable_cleared) - SetHPageMigratable(head); - unlock_page(head); + folio_set_hugetlb_migratable(folio); + folio_unlock(folio); if (res == 1) - put_page(head); + folio_put(folio); return -EOPNOTSUPP; } @@ -1903,7 +1903,7 @@ retry: * or demotion can be prevented by PageHWPoison flag. */ if (res == 0) { - unlock_page(head); + folio_unlock(folio); if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; @@ -1913,10 +1913,10 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = head->flags; + page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, head)) { - unlock_page(head); + if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } -- cgit v1.2.3 From 2ff6cecee669bf0fc63eadebac8cfc81f74b9a4c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:03 -0600 Subject: mm/memory-failure: convert hugetlb_clear_page_hwpoison to folios Change hugetlb_clear_page_hwpoison() to folio_clear_hugetlb_hwpoison() by changing the function to take in a folio. This converts one use of ClearPageHWPoison and HPageRawHwpUnreliable to their folio equivalents. Link: https://lkml.kernel.org/r/20230112204608.80136-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 2 +- mm/memory-failure.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d6413c3b8f5..cf60fe741c1d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -878,9 +878,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE -extern void hugetlb_clear_page_hwpoison(struct page *hpage); +extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else -static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ca9e177b9c54..291ad4cb02f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1731,7 +1731,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); for (i = 0; i < pages_per_huge_page(h); i++) { subpage = folio_page(folio, i); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19f6035608d5..d4aaed2756af 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1785,12 +1785,12 @@ static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) return __free_raw_hwp_pages(hpage, move_flag); } -void hugetlb_clear_page_hwpoison(struct page *hpage) +void folio_clear_hugetlb_hwpoison(struct folio *folio) { - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - ClearPageHWPoison(hpage); - free_raw_hwp_pages(hpage, true); + folio_clear_hwpoison(folio); + free_raw_hwp_pages(&folio->page, true); } /* @@ -1889,7 +1889,7 @@ retry: folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); if (migratable_cleared) folio_set_hugetlb_migratable(folio); folio_unlock(folio); -- cgit v1.2.3 From 9637d7dfb19ce934f81cd56cde23573759c73afb Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:04 -0600 Subject: mm/memory-failure: convert free_raw_hwp_pages() to folios Change free_raw_hwp_pages() to folio_free_raw_hwp(), converts two users of hugetlb specific page macro users to their folio equivalents. Link: https://lkml.kernel.org/r/20230112204608.80136-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d4aaed2756af..a2835907caf8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1766,23 +1766,23 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) return ret; } -static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) { /* - * HPageVmemmapOptimized hugepages can't be freed because struct + * hugetlb_vmemmap_optimized hugepages can't be freed because struct * pages for tail pages are required but they don't exist. */ - if (move_flag && HPageVmemmapOptimized(hpage)) + if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio)) return 0; /* - * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by + * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by * definition. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return 0; - return __free_raw_hwp_pages(hpage, move_flag); + return __free_raw_hwp_pages(&folio->page, move_flag); } void folio_clear_hugetlb_hwpoison(struct folio *folio) @@ -1790,7 +1790,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; folio_clear_hwpoison(folio); - free_raw_hwp_pages(&folio->page, true); + folio_free_raw_hwp(folio, true); } /* @@ -1929,7 +1929,7 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int * return 0; } -static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) +static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) { return 0; } @@ -2336,6 +2336,7 @@ core_initcall(memory_failure_init); int unpoison_memory(unsigned long pfn) { struct page *page; + struct folio *folio; struct page *p; int ret = -EBUSY; unsigned long count = 1; @@ -2348,6 +2349,7 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2389,7 +2391,7 @@ int unpoison_memory(unsigned long pfn) if (!ret) { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; goto unlock_mutex; @@ -2405,7 +2407,7 @@ int unpoison_memory(unsigned long pfn) } else { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; put_page(page); -- cgit v1.2.3 From b02e7582ef245e9694fff6aee8e95fd1764cc5ee Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:05 -0600 Subject: mm/memory-failure: convert raw_hwp_list_head() to folios Change raw_hwp_list_head() to take in a folio and modify its callers to pass in a folio. Also converts two users of hugetlb specific page macro users to their folio equivalents. Link: https://lkml.kernel.org/r/20230112204608.80136-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a2835907caf8..6ae5f234bcc1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1695,9 +1695,9 @@ struct raw_hwp_page { struct page *page; }; -static inline struct llist_head *raw_hwp_list_head(struct page *hpage) +static inline struct llist_head *raw_hwp_list_head(struct folio *folio) { - return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; + return (struct llist_head *)&folio->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) @@ -1705,8 +1705,9 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) struct llist_head *head; struct llist_node *t, *tnode; unsigned long count = 0; + struct folio *folio = page_folio(hpage); - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1727,15 +1728,16 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) struct raw_hwp_page *raw_hwp; struct llist_node *t, *tnode; int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; + struct folio *folio = page_folio(hpage); /* * Once the hwpoison hugepage has lost reliable raw error info, * there is little meaning to keep additional error info precisely, * so skip to add additional raw error info. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1756,9 +1758,9 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * hwpoisoned subpages, and we need refuse to free/dissolve * this hwpoisoned hugepage. */ - SetHPageRawHwpUnreliable(hpage); + folio_set_hugetlb_raw_hwp_unreliable(folio); /* - * Once HPageRawHwpUnreliable is set, raw_hwp_page is not + * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not * used any more, so free it. */ __free_raw_hwp_pages(hpage, false); -- cgit v1.2.3 From 0858b5eb3aab2de0662a40901699162519628f6e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:06 -0600 Subject: mm/memory-failure: convert __free_raw_hwp_pages() to folios Change __free_raw_hwp_pages() to __folio_free_raw_hwp() and modify its callers to pass in a folio. Link: https://lkml.kernel.org/r/20230112204608.80136-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6ae5f234bcc1..387fb9a9ee4e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1700,12 +1700,11 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) return (struct llist_head *)&folio->_hugetlb_hwpoison; } -static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { struct llist_head *head; struct llist_node *t, *tnode; unsigned long count = 0; - struct folio *folio = page_folio(hpage); head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { @@ -1763,7 +1762,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not * used any more, so free it. */ - __free_raw_hwp_pages(hpage, false); + __folio_free_raw_hwp(folio, false); } return ret; } @@ -1784,7 +1783,7 @@ static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return 0; - return __free_raw_hwp_pages(&folio->page, move_flag); + return __folio_free_raw_hwp(folio, move_flag); } void folio_clear_hugetlb_hwpoison(struct folio *folio) -- cgit v1.2.3 From 595dd8185cf1db248b2be4c65ec8936de6ac87c1 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:07 -0600 Subject: mm/memory-failure: convert hugetlb_set_page_hwpoison() to folios Change hugetlb_set_page_hwpoison() to folio_set_hugetlb_hwpoison() and use a folio internally. Link: https://lkml.kernel.org/r/20230112204608.80136-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 387fb9a9ee4e..3821d64e2b2a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1721,13 +1721,12 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) return count; } -static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; struct llist_node *t, *tnode; - int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; - struct folio *folio = page_folio(hpage); + int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; /* * Once the hwpoison hugepage has lost reliable raw error info, @@ -1830,7 +1829,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, goto out; } - if (hugetlb_set_page_hwpoison(&folio->page, page)) { + if (folio_set_hugetlb_hwpoison(folio, page)) { ret = -EHWPOISON; goto out; } -- cgit v1.2.3 From a6fddef49eef2cf68c23e91d73d6a6d5e2cd448f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:08 -0600 Subject: mm/memory-failure: convert unpoison_memory() to folios Use a folio inside unpoison_memory which replaces a compound_head() call with a call to page_folio(). Link: https://lkml.kernel.org/r/20230112204608.80136-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3821d64e2b2a..ba0bbfc074ee 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2335,7 +2335,6 @@ core_initcall(memory_failure_init); */ int unpoison_memory(unsigned long pfn) { - struct page *page; struct folio *folio; struct page *p; int ret = -EBUSY; @@ -2348,7 +2347,6 @@ int unpoison_memory(unsigned long pfn) return -ENXIO; p = pfn_to_page(pfn); - page = compound_head(p); folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2360,31 +2358,31 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!PageHWPoison(p)) { + if (!folio_test_hwpoison(folio)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_count(page) > 1) { + if (folio_ref_count(folio) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapped(page)) { + if (folio_mapped(folio)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapping(page)) { + if (folio_mapping(folio)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (PageSlab(page) || PageTable(page) || PageReserved(page)) + if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); @@ -2397,7 +2395,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } } - ret = TestClearPageHWPoison(page) ? 0 : -EBUSY; + ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { ret = put_page_back_buddy(p) ? 0 : -EBUSY; @@ -2410,14 +2408,14 @@ int unpoison_memory(unsigned long pfn) count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; - put_page(page); + folio_put(folio); goto unlock_mutex; } } - put_page(page); + folio_put(folio); if (TestClearPageHWPoison(p)) { - put_page(page); + folio_put(folio); ret = 0; } } -- cgit v1.2.3 From 69bbb87b3f144e4778f028fd85992aa8dea6ff28 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 Jan 2023 13:10:31 +0000 Subject: shmem: convert shmem_write_end() to use a folio Use a folio internally to shmem_write_end() which saves a number of calls to compound_head() and lets us get rid of the custom code to zero out the rest of a THP and supports folios of arbitrary size. Link: https://lkml.kernel.org/r/20230112131031.1209553-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bc5c156ef470..c5048c6c83dd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2578,33 +2578,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - if (!PageUptodate(page)) { - struct page *head = compound_head(page); - if (PageTransCompound(page)) { - int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) { - if (head + i == page) - continue; - clear_highpage(head + i); - flush_dcache_page(head + i); - } - } - if (copied < PAGE_SIZE) { - unsigned from = pos & (PAGE_SIZE - 1); - zero_user_segments(page, 0, from, - from + copied, PAGE_SIZE); + if (!folio_test_uptodate(folio)) { + if (copied < folio_size(folio)) { + size_t from = offset_in_folio(folio, pos); + folio_zero_segments(folio, 0, from, + from + copied, folio_size(folio)); } - SetPageUptodate(head); + folio_mark_uptodate(folio); } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } -- cgit v1.2.3 From 4947ed93c23218c40fb32226b895d8733cd2e05f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 Jan 2023 20:40:28 +0800 Subject: mm: madvise: use vm_normal_folio() in madvise_free_pte_range() There is already a vm_normal_folio(), use it to make madvise_free_pte_range() only use a folio. Link: https://lkml.kernel.org/r/20230112124028.16964-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 5296e78dccda..92a3c6bd84c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -617,7 +617,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; struct folio *folio; - struct page *page; int nr_swap = 0; unsigned long next; @@ -658,10 +657,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; - folio = page_folio(page); /* * If pmd isn't transhuge but the folio is large and -- cgit v1.2.3 From 8115612883978069fee8793f873a627ff5868718 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:28 +0000 Subject: mm: pagevec: add folio_batch_reinit() Patch series "update mlock to use folios", v4. This series updates mlock to use folios, converting the internal interface to using folios exclusively and exposing the folio interface externally. As a product of this we move to using a folio batch rather than a pagevec for mlock folios, which brings it in line with the core folio batches contained in mm/swap.c. This patch (of 5): This performs the same task as pagevec_reinit(), only modifying a folio batch rather than a pagevec. Link: https://lkml.kernel.org/r/cover.1673526881.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/9018cecacb39e34c883540f997f9be8281153613.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 215eb6c3bdc9..2a6f61a0c10a 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -103,6 +103,11 @@ static inline void folio_batch_init(struct folio_batch *fbatch) fbatch->percpu_pvec_drained = false; } +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; +} + static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; -- cgit v1.2.3 From 90d07210ab55e458c87048e1ad55582ecff0a3d5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:29 +0000 Subject: mm: mlock: use folios and a folio batch internally This brings mlock in line with the folio batches declared in mm/swap.c and makes the code more consistent across the two. The existing mechanism for identifying which operation each folio in the batch is undergoing is maintained, i.e. using the lower 2 bits of the struct folio address (previously struct page address). This should continue to function correctly as folios remain at least system word-aligned. All invocations of mlock() pass either a non-compound page or the head of a THP-compound page and no tail pages need updating so this functionality works with struct folios being used internally rather than struct pages. In this patch the external interface is kept identical to before in order to maintain separation between patches in the series, using a rather awkward conversion from struct page to struct folio in relevant functions. However, this maintenance of the existing interface is intended to be temporary - the next patch in the series will update the interfaces to accept folios directly. Link: https://lkml.kernel.org/r/9f894d54d568773f4ed3cb0eef5f8932f62c95f4.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/mlock.c | 246 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 124 insertions(+), 122 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 7032f6dd0ce1..f8e8d30ab08a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,12 +28,12 @@ #include "internal.h" -struct mlock_pvec { +struct mlock_fbatch { local_lock_t lock; - struct pagevec vec; + struct folio_batch fbatch; }; -static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = { +static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -48,192 +48,192 @@ bool can_do_mlock(void) EXPORT_SYMBOL(can_do_mlock); /* - * Mlocked pages are marked with PageMlocked() flag for efficient testing + * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * - * An mlocked page [PageMlocked(page)] is unevictable. As such, it will - * be placed on the LRU "unevictable" list, rather than the [in]active lists. - * The unevictable list is an LRU sibling list to the [in]active lists. - * PageUnevictable is set to indicate the unevictable state. + * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it + * will be ostensibly placed on the LRU "unevictable" list (actually no such + * list exists), rather than the [in]active lists. PG_unevictable is set to + * indicate the unevictable state. */ -static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) return lruvec; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (unlikely(page_evictable(page))) { + if (unlikely(folio_evictable(folio))) { /* - * This is a little surprising, but quite possible: - * PageMlocked must have got cleared already by another CPU. - * Could this page be on the Unevictable LRU? I'm not sure, - * but move it now if so. + * This is a little surprising, but quite possible: PG_mlocked + * must have got cleared already by another CPU. Could this + * folio be unevictable? I'm not sure, but move it now if so. */ - if (PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + if (folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGRESCUED, - thp_nr_pages(page)); + folio_nr_pages(folio)); } goto out; } - if (PageUnevictable(page)) { - if (PageMlocked(page)) - page->mlock_count++; + if (folio_test_unevictable(folio)) { + if (folio_test_mlocked(folio)) + folio->mlock_count++; goto out; } - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ - if (unlikely(page_evictable(page))) + if (unlikely(folio_evictable(folio))) goto out; - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - add_page_to_lru_list(page, lruvec); - SetPageLRU(page); + lruvec_add_folio(lruvec, folio); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { - int nr_pages = thp_nr_pages(page); + int nr_pages = folio_nr_pages(folio); bool isolated = false; - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (PageUnevictable(page)) { + if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ - if (page->mlock_count) - page->mlock_count--; - if (page->mlock_count) + if (folio->mlock_count) + folio->mlock_count--; + if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: - if (TestClearPageMlocked(page)) { - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (isolated || !PageUnevictable(page)) + if (folio_test_clear_mlocked(folio)) { + __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); + if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } - /* page_evictable() has to be checked *after* clearing Mlocked */ - if (isolated && PageUnevictable(page) && page_evictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + /* folio_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } /* - * Flags held in the low bits of a struct page pointer on the mlock_pvec. + * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ -#define LRU_PAGE 0x1 -#define NEW_PAGE 0x2 -static inline struct page *mlock_lru(struct page *page) +#define LRU_FOLIO 0x1 +#define NEW_FOLIO 0x2 +static inline struct folio *mlock_lru(struct folio *folio) { - return (struct page *)((unsigned long)page + LRU_PAGE); + return (struct folio *)((unsigned long)folio + LRU_FOLIO); } -static inline struct page *mlock_new(struct page *page) +static inline struct folio *mlock_new(struct folio *folio) { - return (struct page *)((unsigned long)page + NEW_PAGE); + return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* - * mlock_pagevec() is derived from pagevec_lru_move_fn(): - * perhaps that can make use of such page pointer flags in future, - * but for now just keep it for mlock. We could use three separate - * pagevecs instead, but one feels better (munlocking a full pagevec - * does not need to drain mlocking pagevecs first). + * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can + * make use of such folio pointer flags in future, but for now just keep it for + * mlock. We could use three separate folio batches instead, but one feels + * better (munlocking a full folio batch does not need to drain mlocking folio + * batches first). */ -static void mlock_pagevec(struct pagevec *pvec) +static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; - struct page *page; + struct folio *folio; int i; - for (i = 0; i < pagevec_count(pvec); i++) { - page = pvec->pages[i]; - mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); - page = (struct page *)((unsigned long)page - mlock); - pvec->pages[i] = page; + for (i = 0; i < folio_batch_count(fbatch); i++) { + folio = fbatch->folios[i]; + mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); + folio = (struct folio *)((unsigned long)folio - mlock); + fbatch->folios[i] = folio; - if (mlock & LRU_PAGE) - lruvec = __mlock_page(page, lruvec); - else if (mlock & NEW_PAGE) - lruvec = __mlock_new_page(page, lruvec); + if (mlock & LRU_FOLIO) + lruvec = __mlock_folio(folio, lruvec); + else if (mlock & NEW_FOLIO) + lruvec = __mlock_new_folio(folio, lruvec); else - lruvec = __munlock_page(page, lruvec); + lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); + release_pages(fbatch->folios, fbatch->nr); + folio_batch_reinit(fbatch); } void mlock_page_drain_local(void) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } void mlock_page_drain_remote(int cpu) { - struct pagevec *pvec; + struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); - pvec = &per_cpu(mlock_pvec.vec, cpu); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); + fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); } bool need_mlock_page_drain(int cpu) { - return pagevec_count(&per_cpu(mlock_pvec.vec, cpu)); + return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** @@ -242,10 +242,10 @@ bool need_mlock_page_drain(int cpu) */ void mlock_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); @@ -255,10 +255,10 @@ void mlock_folio(struct folio *folio) } folio_get(folio); - if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** @@ -267,20 +267,22 @@ void mlock_folio(struct folio *folio) */ void mlock_new_page(struct page *page) { - struct pagevec *pvec; - int nr_pages = thp_nr_pages(page); + struct folio_batch *fbatch; + struct folio *folio = page_folio(page); + int nr_pages = folio_nr_pages(folio); - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - SetPageMlocked(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + folio_set_mlocked(folio); + + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - get_page(page); - if (!pagevec_add(pvec, mlock_new(page)) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, mlock_new(folio)) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** @@ -289,20 +291,20 @@ void mlock_new_page(struct page *page) */ void munlock_page(struct page *page) { - struct pagevec *pvec; + struct folio_batch *fbatch; + struct folio *folio = page_folio(page); - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* - * TestClearPageMlocked(page) must be left to __munlock_page(), - * which will check whether the page is multiply mlocked. + * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), + * which will check whether the folio is multiply mlocked. */ - - get_page(page); - if (!pagevec_add(pvec, page) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, folio) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, -- cgit v1.2.3 From b213ef6b72b5f1f9d333f8aca05440db4e302b46 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:30 +0000 Subject: m68k/mm/motorola: specify pmd_page() type Failing to specify a specific type here breaks anything that relies on the type being explicitly known, such as page_folio(). Make explicit the type of null pointer returned here. Link: https://lkml.kernel.org/r/ad6be2821bbd6af10966b3704568ff458b270d9c.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- arch/m68k/include/asm/motorola_pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 7ac3d64c6b33..562b54e09850 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -124,7 +124,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) * expects pmd_page() to exists, only to then DCE it all. Provide a dummy to * make the compiler happy. */ -#define pmd_page(pmd) NULL +#define pmd_page(pmd) ((struct page *)NULL) #define pud_none(pud) (!pud_val(pud)) -- cgit v1.2.3 From 96f97c438f61ddba94117dcd1a1eb0aaafa22309 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:31 +0000 Subject: mm: mlock: update the interface to use folios Update the mlock interface to accept folios rather than pages, bringing the interface in line with the internal implementation. munlock_vma_page() still requires a page_folio() conversion, however this is consistent with the existent mlock_vma_page() implementation and a product of rmap still dealing in pages rather than folios. Link: https://lkml.kernel.org/r/cba12777c5544305014bc0cbec56bb4cc71477d8.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/internal.h | 38 ++++++++++++++++++++++---------------- mm/migrate.c | 2 +- mm/mlock.c | 38 ++++++++++++++++++-------------------- mm/page_alloc.c | 2 +- mm/rmap.c | 4 ++-- mm/swap.c | 10 +++++----- 6 files changed, 49 insertions(+), 45 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 583e15357e09..973b48e8b1af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -533,10 +533,9 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -565,18 +564,25 @@ static inline void mlock_vma_page(struct page *page, mlock_vma_folio(page_folio(page), vma, compound); } -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); + +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); +} + +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + munlock_vma_folio(page_folio(page), vma, compound); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -665,10 +671,10 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/migrate.c b/mm/migrate.c index 98de7ce2b576..206fcdbe67f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -265,7 +265,7 @@ static bool remove_migration_pte(struct folio *folio, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); diff --git a/mm/mlock.c b/mm/mlock.c index f8e8d30ab08a..9e9c8be58277 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,7 +210,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) folio_batch_reinit(fbatch); } -void mlock_page_drain_local(void) +void mlock_drain_local(void) { struct folio_batch *fbatch; @@ -221,7 +221,7 @@ void mlock_page_drain_local(void) local_unlock(&mlock_fbatch.lock); } -void mlock_page_drain_remote(int cpu) +void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; @@ -231,7 +231,7 @@ void mlock_page_drain_remote(int cpu) mlock_folio_batch(fbatch); } -bool need_mlock_page_drain(int cpu) +bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } @@ -262,13 +262,12 @@ void mlock_folio(struct folio *folio) } /** - * mlock_new_page - mlock a newly allocated page not yet on LRU - * @page: page to be mlocked, either a normal page or a THP head. + * mlock_new_folio - mlock a newly allocated folio not yet on LRU + * @folio: folio to be mlocked, either normal or a THP head. */ -void mlock_new_page(struct page *page) +void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); @@ -286,13 +285,12 @@ void mlock_new_page(struct page *page) } /** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. + * munlock_folio - munlock a folio + * @folio: folio to be munlocked, either normal or a THP head. */ -void munlock_page(struct page *page) +void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); @@ -314,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -322,11 +320,11 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - page = pmd_page(*pmd); + folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); goto out; } @@ -334,15 +332,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; - if (PageTransCompound(page)) + if (folio_test_large(folio)) continue; if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); } pte_unmap(start_pte); out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88494e82843d..83be3b571fd0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8587,7 +8587,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); - mlock_page_drain_remote(cpu); + mlock_drain_remote(cpu); drain_pages(cpu); /* diff --git a/mm/rmap.c b/mm/rmap.c index a079d9964b9c..073999f78adf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1764,7 +1764,7 @@ discard: */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } @@ -2105,7 +2105,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } diff --git a/mm/swap.c b/mm/swap.c index e54e2a252e27..42d67f9baa8c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -562,7 +562,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(&folio->page); + mlock_new_folio(folio); else folio_add_lru(folio); } @@ -781,7 +781,7 @@ void lru_add_drain(void) local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } /* @@ -796,7 +796,7 @@ static void lru_add_and_bh_lrus_drain(void) lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); - mlock_page_drain_local(); + mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) @@ -805,7 +805,7 @@ void lru_add_drain_cpu_zone(struct zone *zone) lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } #ifdef CONFIG_SMP @@ -828,7 +828,7 @@ static bool cpu_needs_drain(unsigned int cpu) folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || - need_mlock_page_drain(cpu) || + need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } -- cgit v1.2.3 From a8265cd917a63c0a1e13caf07e9a0a024480372b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:32 +0000 Subject: Documentation/mm: update references to __m[un]lock_page() to *_folio() We now pass folios to these functions, so update the documentation accordingly. Additionally, correct the outdated reference to __pagevec_lru_add_fn(), the referenced action occurs in __munlock_folio() directly now, replace reference to lru_cache_add_inactive_or_unevictable() with the modern folio equivalent folio_add_lru_vma() and reference folio flags by the flag name rather than accessor. Link: https://lkml.kernel.org/r/898c487169d98a7f09c1c1e57a7dfdc2b3f6bf0f.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 4a0e158aa9ce..2a90d0721dd9 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -308,22 +308,22 @@ do end up getting faulted into this VM_LOCKED VMA, they will be handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +calls mlock_vma_page(), which calls mlock_folio() when the VMA is VM_LOCKED (unless it is a PTE mapping of a part of a transparent huge page). Or when -it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() -calls mlock_new_page() instead: similar to mlock_page(), but can make better +it is a newly allocated anonymous page, folio_add_lru_vma() calls +mlock_new_folio() instead: similar to mlock_folio(), but can make better judgments, since this page is held exclusively and known not to be on LRU yet. -mlock_page() sets PageMlocked immediately, then places the page on the CPU's -mlock pagevec, to batch up the rest of the work to be done under lru_lock by -__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +mlock_folio() sets PG_mlocked immediately, then places the page on the CPU's +mlock folio batch, to batch up the rest of the work to be done under lru_lock by +__mlock_folio(). __mlock_folio() sets PG_unevictable, initializes mlock_count and moves the page to unevictable state ("the unevictable LRU", but with -mlock_count in place of LRU threading). Or if the page was already PageLRU -and PageUnevictable and PageMlocked, it simply increments the mlock_count. +mlock_count in place of LRU threading). Or if the page was already PG_lru +and PG_unevictable and PG_mlocked, it simply increments the mlock_count. But in practice that may not work ideally: the page may not yet be on an LRU, or it may have been temporarily isolated from LRU. In such cases the mlock_count -field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +field cannot be touched, but will be set to 0 later when __munlock_folio() returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: rather than risk stranding a page indefinitely as unevictable, always err with mlock_count on the low side, so that when munlocked the page will be rescued to @@ -377,8 +377,8 @@ that it is munlock() being performed. munlock_page() uses the mlock pagevec to batch up work to be done under lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +mlock_count, and when that reaches 0 it clears PG_mlocked and clears +PG_unevictable, moving the page from unevictable state to inactive LRU. But in practice that may not work ideally: the page may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In @@ -488,8 +488,8 @@ munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED munlock_page() uses the mlock pagevec to batch up work to be done under lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +mlock_count, and when that reaches 0 it clears PG_mlocked and clears +PG_unevictable, moving the page from unevictable state to inactive LRU. But in practice that may not work ideally: the page may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In @@ -515,7 +515,7 @@ munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages present, if one of those pages were unmapped by truncation or hole punch before mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, and would not be counted out of mlock_count. In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to +still appear as PG_mlocked after it has been fully unmapped: and it is left to release_pages() (or __page_cache_release()) to clear it and update statistics before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, which is usually 0). @@ -527,7 +527,7 @@ Page Reclaim in shrink_*_list() vmscan's shrink_active_list() culls any obviously unevictable pages - i.e. !page_evictable(page) pages - diverting those to the unevictable list. However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable +active/inactive LRU lists. Note that these pages do not have PG_unevictable set - otherwise they would be on the unevictable list and shrink_active_list() would never see them. -- cgit v1.2.3 From 62a9bbf2e999b9dca148bd342ab7a29fcf0cb120 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 12 Jan 2023 11:31:47 +0100 Subject: kmsan: silence -Wmissing-prototypes warnings When building the kernel with W=1, the compiler reports numerous warnings about the missing prototypes for KMSAN instrumentation hooks. Because these functions are not supposed to be called explicitly by the kernel code (calls to them are emitted by the compiler), they do not have to be declared in the headers. Instead, we add forward declarations right before the definitions to silence the warnings produced by -Wmissing-prototypes. Link: https://lkml.kernel.org/r/20230112103147.382416-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: Vlastimil Babka Suggested-by: Marco Elver Reviewed-by: Marco Elver Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202301020356.dFruA4I5-lkp@intel.com/T/ Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- mm/kmsan/instrumentation.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 770fe02904f3..cf12e9616b24 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -38,7 +38,15 @@ get_shadow_origin_ptr(void *addr, u64 size, bool store) return ret; } +/* + * KMSAN instrumentation functions follow. They are not declared elsewhere in + * the kernel code, so they are preceded by prototypes, to silence + * -Wmissing-prototypes warnings. + */ + /* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) { @@ -47,6 +55,8 @@ struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); /* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) { @@ -59,12 +69,16 @@ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); * with fixed size. */ #define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ void *addr) \ { \ return get_shadow_origin_ptr(addr, size, /*store*/ false); \ } \ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ void *addr) \ { \ @@ -86,6 +100,7 @@ DECLARE_METADATA_PTR_GETTER(8); * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure * the memory written to in these cases is also marked as initialized. */ +void __msan_instrument_asm_store(void *addr, uintptr_t size); void __msan_instrument_asm_store(void *addr, uintptr_t size) { unsigned long ua_flags; @@ -138,6 +153,7 @@ static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) } /* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n); void *__msan_memmove(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -162,6 +178,7 @@ void *__msan_memmove(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memmove); /* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n); void *__msan_memcpy(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -188,6 +205,7 @@ void *__msan_memcpy(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memcpy); /* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n); void *__msan_memset(void *dst, int c, uintptr_t n) { depot_stack_handle_t origin; @@ -217,6 +235,7 @@ EXPORT_SYMBOL(__msan_memset); * uninitialized value to memory. When reporting an error, KMSAN unrolls and * prints the whole chain of stores that preceded the use of this value. */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin); depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) { depot_stack_handle_t ret = 0; @@ -237,6 +256,7 @@ depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) EXPORT_SYMBOL(__msan_chain_origin); /* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr); void __msan_poison_alloca(void *address, uintptr_t size, char *descr) { depot_stack_handle_t handle; @@ -272,6 +292,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr) EXPORT_SYMBOL(__msan_poison_alloca); /* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size); void __msan_unpoison_alloca(void *address, uintptr_t size) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -287,6 +308,7 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); * Report that an uninitialized value with the given origin was used in a way * that constituted undefined behavior. */ +void __msan_warning(u32 origin); void __msan_warning(u32 origin) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -303,6 +325,7 @@ EXPORT_SYMBOL(__msan_warning); * At the beginning of an instrumented function, obtain the pointer to * `struct kmsan_context_state` holding the metadata for function parameters. */ +struct kmsan_context_state *__msan_get_context_state(void); struct kmsan_context_state *__msan_get_context_state(void) { return &kmsan_get_context()->cstate; -- cgit v1.2.3 From 92644f583d5124b60bc20a3dd21b0bc9142f020c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 13 Jan 2023 16:15:55 -0800 Subject: mm/khugepaged: introduce release_pte_folio() to replace release_pte_page() release_pte_page() is converted to be a wrapper for release_pte_folio() to help facilitate the khugepaged conversion to folios. This replaces 3 calls to compound_head() with 1, and saves 85 bytes of kernel text. Link: https://lkml.kernel.org/r/20230114001556.43795-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 57164c15e076..d7b993e53711 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -490,13 +490,18 @@ void __khugepaged_exit(struct mm_struct *mm) } } +static void release_pte_folio(struct folio *folio) +{ + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + -folio_nr_pages(folio)); + folio_unlock(folio); + folio_putback_lru(folio); +} + static void release_pte_page(struct page *page) { - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - -compound_nr(page)); - unlock_page(page); - putback_lru_page(page); + release_pte_folio(page_folio(page)); } static void release_pte_pages(pte_t *pte, pte_t *_pte, -- cgit v1.2.3 From 9bdfeea46f4926181b9476037c6af28d6d19cc28 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 13 Jan 2023 16:15:56 -0800 Subject: mm/khugepaged: convert release_pte_pages() to use folios Converts release_pte_pages() to use folios instead of pages. Link: https://lkml.kernel.org/r/20230114001556.43795-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b993e53711..b39ab219d5b7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -507,20 +507,20 @@ static void release_pte_page(struct page *page) static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { - struct page *page, *tmp; + struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = *_pte; - page = pte_page(pteval); + folio = pfn_folio(pte_pfn(pteval)); if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !PageCompound(page)) - release_pte_page(page); + !folio_test_large(folio)) + release_pte_folio(folio); } - list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { - list_del(&page->lru); - release_pte_page(page); + list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { + list_del(&folio->lru); + release_pte_folio(folio); } } -- cgit v1.2.3 From 2321ba3e3733f513e46e29b9c70512ecddbf1085 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:01 +0100 Subject: mm/debug_vm_pgtable: more pte_swp_exclusive() sanity checks Patch series "mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures with swap PTEs". This is the follow-up on [1]: [PATCH v2 0/8] mm: COW fixes part 3: reliable GUP R/W FOLL_GET of anonymous pages After we implemented __HAVE_ARCH_PTE_SWP_EXCLUSIVE on most prominent enterprise architectures, implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all remaining architectures that support swap PTEs. This makes sure that exclusive anonymous pages will stay exclusive, even after they were swapped out -- for example, making GUP R/W FOLL_GET of anonymous pages reliable. Details can be found in [1]. This primarily fixes remaining known O_DIRECT memory corruptions that can happen on concurrent swapout, whereby we can lose DMA reads to a page (modifying the user page by writing to it). To verify, there are two test cases (requiring swap space, obviously): (1) The O_DIRECT+swapout test case [2] from Andrea. This test case tries triggering a race condition. (2) My vmsplice() test case [3] that tries to detect if the exclusive marker was lost during swapout, not relying on a race condition. For example, on 32bit x86 (with and without PAE), my test case fails without these patches: $ ./test_swp_exclusive FAIL: page was replaced during COW But succeeds with these patches: $ ./test_swp_exclusive PASS: page was not replaced during COW Why implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE for all architectures, even the ones where swap support might be in a questionable state? This is the first step towards removing "readable_exclusive" migration entries, and instead using pte_swp_exclusive() also with (readable) migration entries instead (as suggested by Peter). The only missing piece for that is supporting pmd_swp_exclusive() on relevant architectures with THP migration support. As all relevant architectures now implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE,, we can drop __HAVE_ARCH_PTE_SWP_EXCLUSIVE in the last patch. I tried cross-compiling all relevant setups and tested on x86 and sparc64 so far. CCing arch maintainers only on this cover letter and on the respective patch(es). [1] https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com [2] https://gitlab.com/aarcange/kernel-testcases-for-v5.11/-/blob/main/page_count_do_wp_page-swap.c [3] https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/test_swp_exclusive.c This patch (of 26): We want to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures. Let's extend our sanity checks, especially testing that our PTE bit does not affect: * is_swap_pte() -> pte_present() and pte_none() * the swap entry + type * pte_swp_soft_dirty() Especially, the pfn_pte() is dodgy when the swap PTE layout differs heavily from ordinary PTEs. Let's properly construct a swap PTE from swap type+offset. [david@redhat.com: fix build] Link: https://lkml.kernel.org/r/6aaad548-cf48-77fa-9d6c-db83d724b2eb@redhat.com Link: https://lkml.kernel.org/r/20230113171026.582290-1-david@redhat.com Link: https://lkml.kernel.org/r/20230113171026.582290-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anton Ivanov Cc: Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Huacai Chen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jason Gunthorpe Cc: Johannes Berg Cc: John Hubbard Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Xuerui Wang Cc: Yang Shi Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bb3328f46126..ff8d6f6af896 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -811,13 +811,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { #ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + unsigned long max_swap_offset; + swp_entry_t entry, entry2; + pte_t pte; pr_debug("Validating PTE swap exclusive\n"); + + /* See generic_max_swapfile_size(): probe the maximum offset */ + max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + + /* Create a swp entry with all possible bits set */ + entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + + pte = swp_entry_to_pte(entry); + WARN_ON(pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_mkexclusive(pte); WARN_ON(!pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + WARN_ON(pte_swp_soft_dirty(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_clear_exclusive(pte); WARN_ON(pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); #endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ } -- cgit v1.2.3 From a172d5128706028ac07b8db709728379ecc72f6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:02 +0100 Subject: alpha/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, mask the type in mk_swap_pte() as well. Link: https://lkml.kernel.org/r/20230113171026.582290-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 41 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 9e45f6735d5d..970abf511b13 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -74,6 +74,9 @@ struct vm_area_struct; #define _PAGE_DIRTY 0x20000 #define _PAGE_ACCESSED 0x40000 +/* We borrow bit 39 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x8000000000UL + /* * NOTE! The "accessed" bit isn't necessarily exact: it can be kept exactly * by software (use the KRE/URE/KWE/UWE bits appropriately), but I'll fake it. @@ -301,18 +304,48 @@ extern inline void update_mmu_cache(struct vm_area_struct * vma, } /* - * Non-present pages: high 24 bits are offset, next 8 bits type, - * low 32 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------- offset ------------------> E <--- type --> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------------------- zeroes --------------------------> + * + * E is the exclusive marker that is not stored in swap entries. */ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 32) | (offset << 40); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 32) | (offset << 40); return pte; } -#define __swp_type(x) (((x).val >> 32) & 0xff) +#define __swp_type(x) (((x).val >> 32) & 0x7f) #define __swp_offset(x) ((x).val >> 40) #define __swp_entry(type, off) ((swp_entry_t) { pte_val(mk_swap_pte((type), (off))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ -- cgit v1.2.3 From 4a446b3dd335d0bd14a5ca3e563688de3637be0c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:03 +0100 Subject: arc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 5, which is yet unused. The only important parts seems to be to not use _PAGE_PRESENT (bit 9). Link: https://lkml.kernel.org/r/20230113171026.582290-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/include/asm/pgtable-bits-arcv2.h | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 515e82db519f..611f412713b9 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -26,6 +26,9 @@ #define _PAGE_GLOBAL (1 << 8) /* ASID agnostic (H) */ #define _PAGE_PRESENT (1 << 9) /* PTE/TLB Valid (H) */ +/* We borrow bit 5 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + #ifdef CONFIG_ARC_MMU_V4 #define _PAGE_HW_SZ (1 << 10) /* Normal/super (H) */ #else @@ -106,9 +109,18 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -/* Encode swap {type,off} tuple into PTE - * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that - * PAGE_PRESENT is zero in a PTE holding swap "identifier" +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset -------------> <--- zero --> E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. */ #define __swp_entry(type, off) ((swp_entry_t) \ { ((type) & 0x1f) | ((off) << 13) }) @@ -120,6 +132,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(swp_mkexclusive, |= (_PAGE_SWP_EXCLUSIVE)); +PTE_BIT_FUNC(swp_clear_exclusive, &= ~(_PAGE_SWP_EXCLUSIVE)); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #include #endif -- cgit v1.2.3 From 20aae9eff5acd8f50f72adca1176f9269a46b827 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:04 +0100 Subject: arm/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 64 GiB (was 128 GiB). While at it drop the PTE_TYPE_FAULT from __swp_entry_to_pte() which is defined to be 0 and is rather confusing because we should be dealing with "Linux PTEs" not "hardware PTEs". Also, properly mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable-2level.h | 3 +++ arch/arm/include/asm/pgtable-3level.h | 3 +++ arch/arm/include/asm/pgtable.h | 35 ++++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 92abd4cd8ca2..ce543cd9380c 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -126,6 +126,9 @@ #define L_PTE_SHARED (_AT(pteval_t, 1) << 10) /* shared(v6), coherent(xsc3) */ #define L_PTE_NONE (_AT(pteval_t, 1) << 11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE L_PTE_RDONLY + /* * These are the memory types, defined to be compatible with * pre-ARMv6 CPUs cacheable and bufferable bits: n/a,n/a,C,B diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index eabe72ff7381..106049791500 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -76,6 +76,9 @@ #define L_PTE_NONE (_AT(pteval_t, 1) << 57) /* PROT_NONE */ #define L_PTE_RDONLY (_AT(pteval_t, 1) << 58) /* READ ONLY */ +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 7) + #define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55) #define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f049072b2e85..886c275995a2 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -271,27 +271,48 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } /* - * Encode and decode a swap entry. Swap entries are stored in the Linux - * page tables as follows: + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset ------------------------> < type -> 0 0 + * <------------------- offset ------------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. * - * This gives us up to 31 swap files and 128GB per swap file. Note that + * This gives us up to 31 swap files and 64GB per swap file. Note that * the offset field is always non-zero. */ #define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT + 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_BITS) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) __pte((swp).val | PTE_TYPE_FAULT) +#define __swp_entry_to_pte(swp) __pte((swp).val) + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} /* * It is an error for the kernel to have more swap files than we can -- cgit v1.2.3 From 41e0d49104dbff888ef6446ea46842fde66c0a76 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:05 +0100 Subject: csky/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 16 GiB (was 32 GiB). We might actually be able to reuse one of the other software bits (_PAGE_READ / PAGE_WRITE) instead, because we only have to keep pte_present(), pte_none() and HW happy. For now, let's keep it simple because there might be something non-obvious. Link: https://lkml.kernel.org/r/20230113171026.582290-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Guo Ren Signed-off-by: Andrew Morton --- arch/csky/abiv1/inc/abi/pgtable-bits.h | 13 +++++++++---- arch/csky/abiv2/inc/abi/pgtable-bits.h | 19 ++++++++++++------- arch/csky/include/asm/pgtable.h | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/pgtable-bits.h b/arch/csky/abiv1/inc/abi/pgtable-bits.h index 752c8b3f9194..ae7a2f76dd42 100644 --- a/arch/csky/abiv1/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv1/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_ACCESSED (1<<3) #define _PAGE_MODIFIED (1<<4) +/* We borrow bit 9 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<9) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<6) #define _PAGE_VALID (1<<7) @@ -26,7 +29,8 @@ #define _PAGE_PROT_NONE _PAGE_READ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) @@ -35,15 +39,16 @@ * bit 6: _PAGE_GLOBAL (zero) * bit 7: _PAGE_VALID (zero) * bit 8: swap type[4] - * bit 9 - 31: swap offset + * bit 9: exclusive marker + * bit 10 - 31: swap offset */ #define __swp_type(x) ((((x).val >> 2) & 0xf) | \ (((x).val >> 4) & 0x10)) -#define __swp_offset(x) ((x).val >> 9) +#define __swp_offset(x) ((x).val >> 10) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0xf) << 2) | \ ((type & 0x10) << 4) | \ - ((offset) << 9)}) + ((offset) << 10)}) #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/csky/abiv2/inc/abi/pgtable-bits.h b/arch/csky/abiv2/inc/abi/pgtable-bits.h index 7e7f389f546f..526152bd2156 100644 --- a/arch/csky/abiv2/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv2/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_PRESENT (1<<10) #define _PAGE_MODIFIED (1<<11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<7) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<0) #define _PAGE_VALID (1<<1) @@ -26,23 +29,25 @@ #define _PAGE_PROT_NONE _PAGE_WRITE /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_GLOBAL (zero) * bit 1: _PAGE_VALID (zero) * bit 2 - 6: swap type - * bit 7 - 8: swap offset[0 - 1] + * bit 7: exclusive marker + * bit 8: swap offset[0] * bit 9: _PAGE_WRITE (zero) * bit 10: _PAGE_PRESENT (zero) - * bit 11 - 31: swap offset[2 - 22] + * bit 11 - 31: swap offset[1 - 21] */ #define __swp_type(x) (((x).val >> 2) & 0x1f) -#define __swp_offset(x) ((((x).val >> 7) & 0x3) | \ - (((x).val >> 9) & 0x7ffffc)) +#define __swp_offset(x) ((((x).val >> 8) & 0x1) | \ + (((x).val >> 10) & 0x3ffffe)) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0x1f) << 2) | \ - ((offset & 0x3) << 7) | \ - ((offset & 0x7ffffc) << 9)}) + ((offset & 0x1) << 8) | \ + ((offset & 0x3ffffe) << 10)}) #endif /* __ASM_CSKY_PGTABLE_BITS_H */ diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 77bc6caff2d2..574c97b9ecca 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,6 +200,24 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, -- cgit v1.2.3 From 61f4a896e62dee8581fea843479058507fda57fb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:06 +0100 Subject: hexagon/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 16 GiB (was 32 GiB). While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/pgtable.h | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index f7048c18b6f9..7eb008e477c8 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -61,6 +61,9 @@ extern unsigned long empty_zero_page; * So we'll put up with a bit of inefficiency for now... */ +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<6) + /* * Top "FOURTH" level (pgd), which for the Hexagon VM is really * only the second from the bottom, pgd and pud both being collapsed. @@ -359,9 +362,12 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as - * swap type/offset tuple. Rather than have the TLB fill handler test + * listed below. Rather than have the TLB fill handler test * _PAGE_PRESENT, we're going to reserve the permissions bits and set them to * all zeros for swap entries, which speeds up the miss handler at the cost of * 3 bits of offset. That trade-off can be revisited if necessary, but Hexagon @@ -371,9 +377,10 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Format of swap PTE: * bit 0: Present (zero) * bits 1-5: swap type (arch independent layer uses 5 bits max) - * bits 6-9: bits 3:0 of offset + * bit 6: exclusive marker + * bits 7-9: bits 2:0 of offset * bits 10-12: effectively _PAGE_PROTNONE (all zero) - * bits 13-31: bits 22:4 of swap offset + * bits 13-31: bits 21:3 of swap offset * * The split offset makes some of the following macros a little gnarly, * but there's plenty of precedent for this sort of thing. @@ -383,11 +390,29 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_type(swp_pte) (((swp_pte).val >> 1) & 0x1f) #define __swp_offset(swp_pte) \ - ((((swp_pte).val >> 6) & 0xf) | (((swp_pte).val >> 9) & 0x7ffff0)) + ((((swp_pte).val >> 7) & 0x7) | (((swp_pte).val >> 10) & 0x3ffff8)) #define __swp_entry(type, offset) \ ((swp_entry_t) { \ - ((type << 1) | \ - ((offset & 0x7ffff0) << 9) | ((offset & 0xf) << 6)) }) + (((type & 0x1f) << 1) | \ + ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} #endif -- cgit v1.2.3 From 3151cc26565ea864c51a78900101ad68b864f405 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:07 +0100 Subject: ia64/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, also mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- arch/ia64/include/asm/pgtable.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 01517a5e6778..e4b8ab931399 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -58,6 +58,9 @@ #define _PAGE_ED (__IA64_UL(1) << 52) /* exception deferral */ #define _PAGE_PROTNONE (__IA64_UL(1) << 63) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #define _PFN_MASK _PAGE_PPN_MASK /* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */ #define _PAGE_CHG_MASK (_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED) @@ -399,6 +402,9 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init (void); /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of * bits in the swap-type field of the swap pte. It would be nice to * enforce that, but we can't easily include here. @@ -406,16 +412,36 @@ extern void paging_init (void); * * Format of swap pte: * bit 0 : present bit (must be zero) - * bits 1- 7: swap-type + * bits 1- 6: swap type + * bit 7 : exclusive marker * bits 8-62: swap offset * bit 63 : _PAGE_PROTNONE bit */ -#define __swp_type(entry) (((entry).val >> 1) & 0x7f) +#define __swp_type(entry) (((entry).val >> 1) & 0x3f) #define __swp_offset(entry) (((entry).val << 1) >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 1) | ((long) (offset) << 8) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type & 0x3f) << 1) | \ + ((long) (offset) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. -- cgit v1.2.3 From ad3150f11b099321331f2f74d008e86ab2a04c7a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:08 +0100 Subject: loongarch/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, also mask the type in mk_swap_pte(). Note that this bit does not conflict with swap PMDs and could also be used in swap PMD context later. Link: https://lkml.kernel.org/r/20230113171026.582290-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Huacai Chen Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable-bits.h | 4 ++++ arch/loongarch/include/asm/pgtable.h | 39 +++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 3d1e0a69975a..8b98d22a145b 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -20,6 +20,7 @@ #define _PAGE_SPECIAL_SHIFT 11 #define _PAGE_HGLOBAL_SHIFT 12 /* HGlobal is a PMD bit */ #define _PAGE_PFN_SHIFT 12 +#define _PAGE_SWP_EXCLUSIVE_SHIFT 23 #define _PAGE_PFN_END_SHIFT 48 #define _PAGE_NO_READ_SHIFT 61 #define _PAGE_NO_EXEC_SHIFT 62 @@ -33,6 +34,9 @@ #define _PAGE_PROTNONE (_ULCAST_(1) << _PAGE_PROTNONE_SHIFT) #define _PAGE_SPECIAL (_ULCAST_(1) << _PAGE_SPECIAL_SHIFT) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (_ULCAST_(1) << _PAGE_SWP_EXCLUSIVE_SHIFT) + /* Used by TLB hardware (placed in EntryLo*) */ #define _PAGE_VALID (_ULCAST_(1) << _PAGE_VALID_SHIFT) #define _PAGE_DIRTY (_ULCAST_(1) << _PAGE_DIRTY_SHIFT) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 7a34e900d8c1..c6b8fe7ac43c 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -249,13 +249,26 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <--- type ---> <---------- zeroes ----------> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT and _PAGE_PROTNONE. */ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) @@ -263,6 +276,24 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void paging_init(void); #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) -- cgit v1.2.3 From ad464ff2c0f91fcacc24167fc435aa45fe0b7d1b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:09 +0100 Subject: m68k/mm: remove dummy __swp definitions for nommu The definitions are not required, let's remove them. Link: https://lkml.kernel.org/r/20230113171026.582290-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Greg Ungerer Signed-off-by: Andrew Morton --- arch/m68k/include/asm/pgtable_no.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fed58da3a6b6..fc044df52b96 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -31,12 +31,6 @@ extern void paging_init(void); #define swapper_pg_dir ((pgd_t *) 0) -#define __swp_type(x) (0) -#define __swp_offset(x) (0) -#define __swp_entry(typ,off) ((swp_entry_t) { ((typ) | ((off) << 7)) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. -- cgit v1.2.3 From ed4154067a086c44608b08f55cceb2688b661750 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:10 +0100 Subject: m68k/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, make sure for sun3 that the valid bit never gets set by properly masking it off and mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Greg Ungerer Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgtable.h | 36 ++++++++++++++++++++++++++--- arch/m68k/include/asm/motorola_pgtable.h | 38 ++++++++++++++++++++++++++++--- arch/m68k/include/asm/sun3_pgtable.h | 39 +++++++++++++++++++++++++++++--- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index b619b22823f8..3f8f4d0e66dd 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -46,6 +46,9 @@ #define _CACHEMASK040 (~0x060) #define _PAGE_GLOBAL040 0x400 /* 68040 global bit, used for kva descs */ +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x080 + /* * Externally used page protection values. */ @@ -254,15 +257,42 @@ static inline pte_t pte_mkcache(pte_t pte) extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; /* - * Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------ offset -------------> 0 0 0 E <-- type ---> + * + * E is the exclusive marker that is not stored in swap entries. */ -#define __swp_type(x) ((x).val & 0xFF) +#define __swp_type(x) ((x).val & 0x7f) #define __swp_offset(x) ((x).val >> 11) -#define __swp_entry(typ, off) ((swp_entry_t) { (typ) | \ +#define __swp_entry(typ, off) ((swp_entry_t) { ((typ) & 0x7f) | \ (off << 11) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 562b54e09850..c1782563e793 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -41,6 +41,9 @@ #define _PAGE_PROTNONE 0x004 +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x800 + #ifndef __ASSEMBLY__ /* This is the cache mode to be used for pages containing page descriptors for @@ -169,12 +172,41 @@ static inline pte_t pte_mkcache(pte_t pte) #define swapper_pg_dir kernel_pg_dir extern pgd_t kernel_pg_dir[128]; -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -#define __swp_type(x) (((x).val >> 4) & 0xff) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------> E <-- type ---> 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) (((x).val >> 4) & 0x7f) #define __swp_offset(x) ((x).val >> 12) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 12) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x7f) << 4) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* _MOTOROLA_PGTABLE_H */ diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 90d57e537eb1..dbfc9703b15d 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -71,6 +71,9 @@ #define SUN3_PMD_MASK (0x0000003F) #define SUN3_PMD_MAGIC (0x0000002B) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x040 + #ifndef __ASSEMBLY__ /* @@ -152,12 +155,42 @@ static inline pte_t pte_mkcache(pte_t pte) { return pte; } extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; -/* Macros to (de)construct the fake PTEs representing swap pages. */ -#define __swp_type(x) ((x).val & 0x7F) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * 0 <--------------------- offset ----------------> E <- type --> + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x3f) #define __swp_offset(x) (((x).val) >> 7) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) | ((offset) << 7)) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x3f) | \ + (((offset) << 7) & ~SUN3_PAGE_VALID)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* !_SUN3_PGTABLE_H */ -- cgit v1.2.3 From b5c88f21531c3457e3e202817556b68171484e00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:11 +0100 Subject: microblaze/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. The shift by 2 when converting between PTE and arch-specific swap entry makes the swap PTE layout a little bit harder to decipher. While at it, drop the comment from paulus---copy-and-paste leftover from powerpc where we actually have _PAGE_HASHPTE---and mask the type in __swp_entry_to_pte() as well. Link: https://lkml.kernel.org/r/20230113171026.582290-12-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michal Simek Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgtable.h | 4 ++-- arch/microblaze/include/asm/pgtable.h | 45 +++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 3f8f4d0e66dd..e573d7b649f7 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -46,8 +46,8 @@ #define _CACHEMASK040 (~0x060) #define _PAGE_GLOBAL040 0x400 /* 68040 global bit, used for kva descs */ -/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ -#define _PAGE_SWP_EXCLUSIVE 0x080 +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE CF_PAGE_NOCACHE /* * Externally used page protection values. diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 42f5988e998b..7e3de54bf426 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -131,10 +131,10 @@ extern pte_t *va_to_pte(unsigned long address); * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 4xx doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 - * is cleared in the TLB miss handler before the TLB entry is loaded. + * - PRESENT *must* be in the bottom two bits because swap PTEs use the top + * 30 bits. Because 4xx doesn't support SMP anyway, M is irrelevant so we + * borrow it for PAGE_PRESENT. Bit 30 is cleared in the TLB miss handler + * before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for * software PTE bits. We actually use bits 21, 24, 25, and @@ -155,6 +155,9 @@ extern pte_t *va_to_pte(unsigned long address); #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ #define _PMD_PRESENT PAGE_MASK +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + /* * Some bits are unused... */ @@ -393,18 +396,40 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit, or the _PAGE_HASHPTE bit - * (if used). -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. */ -#define __swp_type(entry) ((entry).val & 0x3f) +#define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 6) #define __swp_entry(type, offset) \ - ((swp_entry_t) { (type) | ((offset) << 6) }) + ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 6) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern unsigned long iopa(unsigned long addr); /* Values for nocacheflag and cmode */ -- cgit v1.2.3 From 83d3b2b46ea3f8fa542fb7528b3bca6f476d0fab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:12 +0100 Subject: mips/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. On 64bit, steal one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. On 32bit we're able to locate unused bits. As the PTE layout for 32 bit is very confusing, document it a bit better. While at it, mask the type in __swp_entry()/mk_swap_pte(). Link: https://lkml.kernel.org/r/20230113171026.582290-13-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgtable-32.h | 88 ++++++++++++++++++++++++++++++++------ arch/mips/include/asm/pgtable-64.h | 23 ++++++++-- arch/mips/include/asm/pgtable.h | 36 ++++++++++++++++ 3 files changed, 131 insertions(+), 16 deletions(-) diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index b40a0e69fccc..ba0016709a1a 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -191,49 +191,113 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #define pte_page(x) pfn_to_page(pte_pfn(x)) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + */ #if defined(CONFIG_CPU_R3K_TLB) -/* Swap entries must have VALID bit cleared. */ +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------- offset ------------> < type -> V G E 0 0 0 0 0 0 P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 10) & 0x1f) #define __swp_offset(x) ((x).val >> 15) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 10) | ((offset) << 15) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 10) | ((offset) << 15) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else #if defined(CONFIG_XPA) -/* Swap entries must have VALID and GLOBAL bits cleared. */ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * 0 0 0 0 0 0 E P <------------------ zeroes -------------------> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> < type -> V G 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 4) & 0x1f) #define __swp_offset(x) ((x).val >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 9) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 4) | ((offset) << 9) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 57 (bit 25 in the low PTE) to store the exclusive marker in + * swap PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 25) + #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) -/* Swap entries must have VALID and GLOBAL bits cleared. */ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------ zeroes -------------------> E P 0 0 0 0 0 0 + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- offset --------------------> < type -> V G + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 7) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 7) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 2) | ((offset) << 7) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 39 (bit 7 in the low PTE) to store the exclusive marker in swap + * PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else /* - * Constraints: - * _PAGE_PRESENT at bit 0 - * _PAGE_MODIFIED at bit 4 - * _PAGE_GLOBAL at bit 6 - * _PAGE_VALID at bit 7 + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------- offset --------------> < type -> 0 0 0 0 0 0 E P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. The location of V and G varies. */ #define __swp_type(x) (((x).val >> 8) & 0x1f) #define __swp_offset(x) ((x).val >> 13) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 1 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 1) + #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #endif /* defined(CONFIG_CPU_R3K_TLB) */ diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index c6310192b654..98e24e3e7f2b 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -320,16 +320,31 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <-- type ---> <---------- zeroes -----------> + * + * E is the exclusive marker that is not stored in swap entries. */ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 23) + #endif /* _ASM_PGTABLE_64_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index a68c0b01d8cd..711874cee8e4 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -528,6 +528,42 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } #endif +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte.pte_low |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte.pte_low &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#else +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#endif extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); -- cgit v1.2.3 From 0a9ad8273ff4643dd50ff6b3ec991ab270e87a89 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:13 +0100 Subject: nios2/mm: refactor swap PTE layout nios2 disables swap for a good reason: it doesn't even provide sufficient type bits as required by core MM. However, swap entries are nowadays also used for other purposes (migration entries, PTE markers, HWPoison, ...), and accidential use could be problematic. Let's properly use 5 bits for the swap type and document the layout. Bits 26--31 should get ignored by hardware completely, so they can be used. Link: https://lkml.kernel.org/r/20230113171026.582290-14-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index ab793bc517f5..d1e5c9eb4643 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -232,19 +232,21 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) __FILE__, __LINE__, pgd_val(e)) /* - * Encode and decode a swap entry (must be !pte_none(pte) && !pte_present(pte): + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * - * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 ... 1 0 - * 0 0 0 0 type. 0 0 0 0 0 0 offset......... + * Format of swap PTEs: * - * This gives us up to 2**2 = 4 swap files and 2**20 * 4K = 4G per swap file. + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * 0 < type -> 0 0 0 0 0 0 <-------------- offset ---------------> * - * Note that the offset field is always non-zero, thus !pte_none(pte) is always - * true. + * Note that the offset field is always non-zero if the swap type is 0, thus + * !pte_none() is always true. */ -#define __swp_type(swp) (((swp).val >> 26) & 0x3) +#define __swp_type(swp) (((swp).val >> 26) & 0x1f) #define __swp_offset(swp) ((swp).val & 0xfffff) -#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x3) << 26) \ +#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x1f) << 26) \ | ((off) & 0xfffff) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -- cgit v1.2.3 From 4d1d955f7c0cdbb7562d19049f859c74276fdfeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:14 +0100 Subject: nios2/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using the yet-unused bit 31. Link: https://lkml.kernel.org/r/20230113171026.582290-15-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable-bits.h | 3 +++ arch/nios2/include/asm/pgtable.h | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/nios2/include/asm/pgtable-bits.h b/arch/nios2/include/asm/pgtable-bits.h index bfddff383e89..724f9b08b1d1 100644 --- a/arch/nios2/include/asm/pgtable-bits.h +++ b/arch/nios2/include/asm/pgtable-bits.h @@ -31,4 +31,7 @@ #define _PAGE_ACCESSED (1<<26) /* page referenced */ #define _PAGE_DIRTY (1<<27) /* dirty page */ +/* We borrow bit 31 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<31) + #endif /* _ASM_NIOS2_PGTABLE_BITS_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index d1e5c9eb4643..05999da01731 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -239,7 +239,9 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * 0 < type -> 0 0 0 0 0 0 <-------------- offset ---------------> + * E < type -> 0 0 0 0 0 0 <-------------- offset ---------------> + * + * E is the exclusive marker that is not stored in swap entries. * * Note that the offset field is always non-zero if the swap type is 0, thus * !pte_none() is always true. @@ -251,6 +253,24 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void __init paging_init(void); extern void __init mmu_init(void); -- cgit v1.2.3 From 5ae3e74474f82613482ed67b0c234f61ac6ca2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:15 +0100 Subject: openrisc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-16-david@redhat.com Signed-off-by: David Hildenbrand Cc: Stefan Kristiansson Cc: Stafford Horne Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/pgtable.h | 41 ++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 6477c17b3062..903b32d662ab 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -154,6 +154,9 @@ extern void paging_init(void); #define _KERNPG_TABLE \ (_PAGE_BASE | _PAGE_SRE | _PAGE_SWE | _PAGE_ACCESSED | _PAGE_DIRTY) +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_U_SHARED + #define PAGE_NONE __pgprot(_PAGE_ALL) #define PAGE_READONLY __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE) #define PAGE_READONLY_X __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE | _PAGE_EXEC) @@ -385,16 +388,44 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, /* __PHX__ FIXME, SWAP, this probably doesn't work */ -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -/* Since the PAGE_PRESENT bit is bit 4, we can use the bits above */ - -#define __swp_type(x) (((x).val >> 5) & 0x7f) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> E <- type --> 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. + */ +#define __swp_type(x) (((x).val >> 5) & 0x3f) #define __swp_offset(x) ((x).val >> 12) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 12) }) + ((swp_entry_t) { (((type) & 0x3f) << 5) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From 6d239fc78c0b0c687e5408573350714e6e789d71 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:16 +0100 Subject: parisc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using the yet-unused _PAGE_ACCESSED location in the swap PTE. Looking at pte_present() and pte_none() checks, there seems to be no actual reason why we cannot use it: we only have to make sure we're not using _PAGE_PRESENT. Reusing this bit avoids having to steal one bit from the swap offset. Link: https://lkml.kernel.org/r/20230113171026.582290-17-david@redhat.com Signed-off-by: David Hildenbrand Cc: "James E.J. Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton --- arch/parisc/include/asm/pgtable.h | 41 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index ea357430aafe..3033bb88df34 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -218,6 +218,9 @@ extern void __update_cache(pte_t pte); #define _PAGE_KERNEL_RWX (_PAGE_KERNEL_EXEC | _PAGE_WRITE) #define _PAGE_KERNEL (_PAGE_KERNEL_RO | _PAGE_WRITE) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + /* The pgd/pmd contains a ptr (in phys addr space); since all pgds/pmds * are page-aligned, we don't care about the PAGE_OFFSET bits, except * for a few meta-information bits, so we shift the address to be @@ -394,17 +397,49 @@ extern void paging_init (void); #define update_mmu_cache(vms,addr,ptep) __update_cache(*ptep) -/* Encode and de-code a swap entry */ - +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <---------------- offset -----------------> P E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) must be 0. + * + * For the 64bit version, the offset is extended by 32bit. + */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ( (((x).val >> 6) & 0x7) | \ (((x).val >> 8) & ~0x7) ) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | \ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) & 0x1f) | \ ((offset & 0x7) << 6) | \ ((offset & ~0x7) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; -- cgit v1.2.3 From 8897ebff37fd34920d380cbfafbfb47804eb4009 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:17 +0100 Subject: powerpc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit book3s We already implemented support for 64bit book3s in commit bff9beaa2e80 ("powerpc/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE for book3s") Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE also in 32bit by reusing yet unused LSB 2 / MSB 29. There seems to be no real reason why that bit cannot be used, and reusing it avoids having to steal one bit from the swap offset. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-18-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 38 ++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 75823f39e042..0ecb3a58f23f 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -42,6 +42,9 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +/* We borrow the _PAGE_USER bit to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + /* And here we include common definitions */ #define _PAGE_KERNEL_RO 0 @@ -363,17 +366,42 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used). - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <----------------- offset --------------------> < type -> E H P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) and __PAGE_HASHPTE (H) must be 0. + * + * For 64bit PTEs, the offset is extended by 32bit. */ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_read(pte_t pte) { return 1; } -- cgit v1.2.3 From 2bba2ffbe0303552142af944b891967ccf69a63b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:18 +0100 Subject: powerpc/nohash/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit and 64bit. On 64bit, let's use MSB 56 (LSB 7), located right next to the page type. On 32bit, let's use LSB 2 to avoid stealing one bit from the swap offset. There seems to be no real reason why these bits cannot be used for swap PTEs. The important part is that _PAGE_PRESENT and _PAGE_HASHPTE remain 0. While at it, mask the type in __swp_entry() and remove _PAGE_BIT_SWAP_TYPE from pte-e500.h: while it was used in 64bit code it was ignored in 32bit code. Link: https://lkml.kernel.org/r/20230113171026.582290-19-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/nohash/32/pgtable.h | 22 +++++++++++++++++----- arch/powerpc/include/asm/nohash/32/pte-40x.h | 6 +++--- arch/powerpc/include/asm/nohash/32/pte-44x.h | 18 ++++-------------- arch/powerpc/include/asm/nohash/32/pte-85xx.h | 4 ++-- arch/powerpc/include/asm/nohash/64/pgtable.h | 24 +++++++++++++++++++++--- arch/powerpc/include/asm/nohash/pgtable.h | 16 ++++++++++++++++ arch/powerpc/include/asm/nohash/pte-e500.h | 1 - 7 files changed, 63 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 70edad44dff6..fec56d965f00 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -360,18 +360,30 @@ static inline int pte_young(pte_t pte) #endif #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) + /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit. - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> < type -> E 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * + * For 64bit PTEs, the offset is extended by 32bit. */ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +/* We borrow LSB 2 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x000004 + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 2d3153cfc0d7..6fe46e754556 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -27,9 +27,9 @@ * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 40x doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 + * - PRESENT *must* be in the bottom two bits because swap PTEs + * use the top 30 bits. Because 40x doesn't support SMP anyway, M is + * irrelevant so we borrow it for PAGE_PRESENT. Bit 30 * is cleared in the TLB miss handler before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index 78bc304f750e..b7ed13cee137 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -56,20 +56,10 @@ * above bits. Note that the bit values are CPU specific, not architecture * specific. * - * The kernel PTE entry holds an arch-dependent swp_entry structure under - * certain situations. In other words, in such situations some portion of - * the PTE bits are used as a swp_entry. In the PPC implementation, the - * 3-24th LSB are shared with swp_entry, however the 0-2nd three LSB still - * hold protection values. That means the three protection bits are - * reserved for both PTE and SWAP entry at the most significant three - * LSBs. - * - * There are three protection bits available for SWAP entry: - * _PAGE_PRESENT - * _PAGE_HASHPTE (if HW has) - * - * So those three bits have to be inside of 0-2nd LSB of PTE. - * + * The kernel PTE entry can be an ordinary PTE mapping a page or a special swap + * PTE. In case of a swap PTE, LSB 2-24 are used to store information regarding + * the swap entry. However LSB 0-1 still hold protection values, for example, + * to distinguish swap PTEs from ordinary PTEs, and must be used with care. */ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 93fb8e11a3f1..16451df5ddb0 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -11,8 +11,8 @@ 32 33 34 35 36 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 RPN...................... 0 0 U0 U1 U2 U3 UX SX UW SW UR SR - - PRESENT *must* be in the bottom three bits because swap cache - entries use the top 29 bits. + - PRESENT *must* be in the bottom two bits because swap PTEs use + the top 30 bits. */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 879e9a6e5a87..287e25864ffa 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -276,22 +276,40 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <-------------------------- offset ---------------------------- + * + * 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 + * 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 + * --------------> <----------- zero ------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define MAX_SWAPFILES_CHECK() do { \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ } while (0) #define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ +#define __swp_type(x) (((x).val >> 2) \ & ((1UL << SWP_TYPE_BITS) - 1)) #define __swp_offset(x) ((x).val >> PTE_RPN_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ + (((type) & 0x1f) << 2) \ | ((offset) << PTE_RPN_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) +/* We borrow MSB 56 (LSB 7) to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x80 + int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); extern int __meminit vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 69c3a050a3d8..5f4620940c2c 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -151,6 +151,22 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Insert a PTE, top-level function is out of line. It uses an inline * low level function in the respective pgtable-* files */ diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 0934e8965e4e..d8924cbd61e4 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -12,7 +12,6 @@ /* Architected bits */ #define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ #define _PAGE_SW1 0x000002 -#define _PAGE_BIT_SWAP_TYPE 2 #define _PAGE_BAP_SR 0x000004 #define _PAGE_BAP_UR 0x000008 #define _PAGE_BAP_SW 0x000010 -- cgit v1.2.3 From 51a1007d4113c632ec5229c685e2162b72d9746d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:19 +0100 Subject: riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file: on 32bit to 16 GiB (was 32 GiB). Note that this bit does not conflict with swap PMDs and could also be used in swap PMD context later. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com Signed-off-by: David Hildenbrand Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable-bits.h | 3 +++ arch/riscv/include/asm/pgtable.h | 29 ++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index b9e13a8fe2b7..f896708e8331 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -27,6 +27,9 @@ */ #define _PAGE_PROT_NONE _PAGE_GLOBAL +/* Used for swap PTEs only. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + #define _PAGE_PFN_SHIFT 10 /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 4eba9a98d0e3..03a4728db039 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -724,16 +724,18 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) * bit 1 to 3: _PAGE_LEAF (zero) * bit 5: _PAGE_PROT_NONE (zero) - * bits 6 to 10: swap type - * bits 10 to XLEN-1: swap offset + * bit 6: exclusive marker + * bits 7 to 11: swap type + * bits 11 to XLEN-1: swap offset */ -#define __SWP_TYPE_SHIFT 6 +#define __SWP_TYPE_SHIFT 7 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1UL << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) @@ -744,11 +746,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) \ - { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) + { (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(swp) __pmd((swp).val) -- cgit v1.2.3 From cca10df1029373cda5904887544ca6fcbbd2bac7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:20 +0100 Subject: sh/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 6 in the PTE, reducing the swap type in the !CONFIG_X2TLB case to 5 bits. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. Interrestingly, the swap type in the !CONFIG_X2TLB case could currently overlap with the _PAGE_PRESENT bit, because there is a sneaky shift by 1 in __pte_to_swp_entry() and __swp_entry_to_pte(). Bit 0-7 in the architecture specific swap PTE would get shifted to bit 1-8 in the PTE. As generic MM uses 5 bits only, this didn't matter so far. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton --- arch/sh/include/asm/pgtable_32.h | 54 +++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index d0240decacca..c34aa795a9d2 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -423,40 +423,70 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #endif /* - * Encode and de-code a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Constraints: * _PAGE_PRESENT at bit 8 * _PAGE_PROTNONE at bit 9 * - * For the normal case, we encode the swap type into bits 0:7 and the - * swap offset into bits 10:30. For the 64-bit PTE case, we keep the - * preserved bits in the low 32-bits and use the upper 32 as the swap - * offset (along with a 5-bit type), following the same approach as x86 - * PAE. This keeps the logic quite simple. + * For the normal case, we encode the swap type and offset into the swap PTE + * such that bits 8 and 9 stay zero. For the 64-bit PTE case, we use the + * upper 32 for the swap offset and swap type, following the same approach as + * x86 PAE. This keeps the logic quite simple. * * As is evident by the Alpha code, if we ever get a 64-bit unsigned * long (swp_entry_t) to match up with the 64-bit PTEs, this all becomes * much cleaner.. - * - * NOTE: We should set ZEROs at the position of _PAGE_PRESENT - * and _PAGE_PROTNONE bits */ + #ifdef CONFIG_X2TLB +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------- offset ----------------------> < type -> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- zeroes --------------------> E 0 0 0 0 0 0 + */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t){ (type) | (offset) << 5}) +#define __swp_entry(type, offset) ((swp_entry_t){ ((type) & 0x1f) | (offset) << 5}) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) #else -#define __swp_type(x) ((x).val & 0xff) +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> 0 0 0 0 E < type -> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 10) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) <<10}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & 0x1f) | (offset) << 10}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 1 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 1 }) #endif +/* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(low, swp_mkexclusive, |= _PAGE_SWP_EXCLUSIVE); +PTE_BIT_FUNC(low, swp_clear_exclusive, &= ~_PAGE_SWP_EXCLUSIVE); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_PGTABLE_32_H */ -- cgit v1.2.3 From e6b37d7f6f17db638275b6f8fd78714c047e3a57 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:21 +0100 Subject: sparc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by reusing the SRMMU_DIRTY bit as that seems to be safe to reuse inside a swap PTE. This avoids having to steal one bit from the swap offset. While at it, relocate the swap PTE layout documentation and use the same style now used for most other archs. Note that the old documentation was wrong: we use 20 bit for the offset and the reserved bits were 8 instead of 7 bits in the ascii art. Link: https://lkml.kernel.org/r/20230113171026.582290-22-david@redhat.com Signed-off-by: David Hildenbrand Cc: "David S. Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_32.h | 27 ++++++++++++++++++++++++++- arch/sparc/include/asm/pgtsrmmu.h | 14 +++----------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 5acc05b572e6..abf7a2601209 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -323,7 +323,16 @@ void srmmu_mapiorange(unsigned int bus, unsigned long xpa, unsigned long xva, unsigned int len); void srmmu_unmapiorange(unsigned long virt_addr, unsigned int len); -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> < type -> E 0 0 0 0 0 0 + */ static inline unsigned long __swp_type(swp_entry_t entry) { return (entry.val >> SRMMU_SWP_TYPE_SHIFT) & SRMMU_SWP_TYPE_MASK; @@ -344,6 +353,22 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | SRMMU_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~SRMMU_SWP_EXCLUSIVE); +} + static inline unsigned long __get_phys (unsigned long addr) { diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 6067925972d9..18e68d43f036 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -53,21 +53,13 @@ #define SRMMU_CHG_MASK (0xffffff00 | SRMMU_REF | SRMMU_DIRTY) -/* SRMMU swap entry encoding - * - * We use 5 bits for the type and 19 for the offset. This gives us - * 32 swapfiles of 4GB each. Encoding looks like: - * - * oooooooooooooooooootttttRRRRRRRR - * fedcba9876543210fedcba9876543210 - * - * The bottom 7 bits are reserved for protection and status bits, especially - * PRESENT. - */ +/* SRMMU swap entry encoding */ #define SRMMU_SWP_TYPE_MASK 0x1f #define SRMMU_SWP_TYPE_SHIFT 7 #define SRMMU_SWP_OFF_MASK 0xfffff #define SRMMU_SWP_OFF_SHIFT (SRMMU_SWP_TYPE_SHIFT + 5) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define SRMMU_SWP_EXCLUSIVE SRMMU_DIRTY /* Some day I will implement true fine grained access bits for * user pages because the SRMMU gives us the capabilities to -- cgit v1.2.3 From adf8e329ff56a873c789f797a69a330fb8c4ff9e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:22 +0100 Subject: sparc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 64bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit was effectively unused. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-23-david@redhat.com Signed-off-by: David Hildenbrand Cc: "David S. Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_64.h | 38 ++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 3bc9736bddb1..a1658eebd036 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -187,6 +187,9 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V +/* We borrow bit 20 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _AC(0x0000000000100000, UL) + #ifndef __ASSEMBLY__ pte_t mk_pte_io(unsigned long, pgprot_t, int, unsigned long); @@ -961,18 +964,47 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif -/* Encode and de-code a swap entry */ -#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------> E <-- type ---> <------- zeroes --------> + */ +#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0x7fUL) #define __swp_offset(entry) ((entry).val >> (PAGE_SHIFT + 8UL)) #define __swp_entry(type, offset) \ ( (swp_entry_t) \ { \ - (((long)(type) << PAGE_SHIFT) | \ + ((((long)(type) & 0x7fUL) << PAGE_SHIFT) | \ ((long)(offset) << (PAGE_SHIFT + 8UL))) \ } ) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + int page_in_phys_avail(unsigned long paddr); /* -- cgit v1.2.3 From e2858d778e6832d8f58dc3b361f78bd9f03cd2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:23 +0100 Subject: um/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 10, which is yet unused for swap PTEs. The pte_mkuptodate() is a bit weird in __pte_to_swp_entry() for a swap PTE ... but it only messes with bit 1 and 2 and there is a comment in set_pte(), so leave these bits alone. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-24-david@redhat.com Signed-off-by: David Hildenbrand Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Signed-off-by: Andrew Morton --- arch/um/include/asm/pgtable.h | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 4e3052f2671a..cedc5fd451ce 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -21,6 +21,9 @@ #define _PAGE_PROTNONE 0x010 /* if the user mapped it with PROT_NONE; pte_present gives true */ +/* We borrow bit 10 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x400 + #ifdef CONFIG_3_LEVEL_PGTABLES #include #else @@ -288,16 +291,46 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); #define update_mmu_cache(vma,address,ptep) do {} while (0) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> E < type -> 0 0 0 1 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_NEWPAGE (bit 1) is always set to 1 in set_pte(). + */ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 11) }) + ((swp_entry_t) { (((type) & 0x1f) << 5) | ((offset) << 11) }) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_set_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_clear_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ -- cgit v1.2.3 From 93c0eac40d4e3ce4d5a3c6e0dc74eceaf8f63e0d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:24 +0100 Subject: x86/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE also on 32bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE just like we already do on x86-64. After deciphering the PTE layout it becomes clear that there are still unused bits for 2-level and 3-level page tables that we should be able to use. Reusing a bit avoids stealing one bit from the swap offset. While at it, mask the type in __swp_entry(); use some helper definitions to make the macros easier to grasp. Link: https://lkml.kernel.org/r/20230113171026.582290-25-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable-2level.h | 26 +++++++++++++++++++++----- arch/x86/include/asm/pgtable-3level.h | 26 +++++++++++++++++++++++--- arch/x86/include/asm/pgtable.h | 2 -- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 60d0f9015317..e9482a11ac52 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> 0 E <- type --> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) +#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ - & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \ + & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ + (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \ | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + /* No inverted PFNs on 2 level page tables */ static inline u64 protnone_mask(u64 val) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 967b135fa2c0..9e7c0b719c3c 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -145,8 +145,24 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * < type -> <---------------------- offset ---------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------------------------------> 0 E 0 0 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) @@ -154,9 +170,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_TYPE_BITS) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \ + | (offset) << SWP_TYPE_BITS}) /* * Normally, __swp_entry() converts from arch-independent swp_entry_t to @@ -184,6 +201,9 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + #include #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c843395a8b3..d25195726b78 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1299,7 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#ifdef _PAGE_SWP_EXCLUSIVE #define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { @@ -1315,7 +1314,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } -#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -- cgit v1.2.3 From f5c3fe300c5b40ff9af5ce2c9dd9897e91ce5735 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:25 +0100 Subject: xtensa/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 1. This bit should be safe to use for our usecase. Most importantly, we can still distinguish swap PTEs from PAGE_NONE PTEs (see pte_present()) and don't use one of the two reserved attribute masks (1101 and 1111). Attribute mask 1100 and 1110 now identify swap PTEs. While at it, remove SWP_TYPE_BITS (not really helpful as it's not used in the actual swap macros) and mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-26-david@redhat.com Signed-off-by: David Hildenbrand Cc: Chris Zankel Cc: Max Filippov Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/pgtable.h | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 5b5484d707b2..1025e2dc292b 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -96,7 +96,7 @@ * +- - - - - - - - - - - - - - - - - - - - -+ * (PAGE_NONE)| PPN | 0 | 00 | ADW | 01 | 11 | 11 | * +-----------------------------------------+ - * swap | index | type | 01 | 11 | 00 | + * swap | index | type | 01 | 11 | e0 | * +-----------------------------------------+ * * For T1050 hardware and earlier the layout differs for present and (PAGE_NONE) @@ -112,6 +112,7 @@ * RI ring (0=privileged, 1=user, 2 and 3 are unused) * CA cache attribute: 00 bypass, 01 writeback, 10 writethrough * (11 is invalid and used to mark pages that are not present) + * e exclusive marker in swap PTEs * w page is writable (hw) * x page is executable (hw) * index swap offset / PAGE_SIZE (bit 11-31: 21 bits -> 8 GB) @@ -158,6 +159,9 @@ #define _PAGE_DIRTY (1<<7) /* software: page dirty */ #define _PAGE_ACCESSED (1<<8) /* software: page accessed (read) */ +/* We borrow bit 1 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<1) + #ifdef CONFIG_MMU #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -343,19 +347,37 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) } /* - * Encode and decode a swap and file entry. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). */ -#define SWP_TYPE_BITS 5 -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(entry) (((entry).val >> 6) & 0x1f) #define __swp_offset(entry) ((entry).val >> 11) #define __swp_entry(type,offs) \ - ((swp_entry_t){((type) << 6) | ((offs) << 11) | \ + ((swp_entry_t){(((type) & 0x1f) << 6) | ((offs) << 11) | \ _PAGE_CA_INVALID | _PAGE_USER}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !defined (__ASSEMBLY__) */ -- cgit v1.2.3 From 950fe885a89770619e315f9b46301eebf0aab7b3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:26 +0100 Subject: mm: remove __HAVE_ARCH_PTE_SWP_EXCLUSIVE __HAVE_ARCH_PTE_SWP_EXCLUSIVE is now supported by all architectures that support swp PTEs, so let's drop it. Link: https://lkml.kernel.org/r/20230113171026.582290-27-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 1 - arch/arc/include/asm/pgtable-bits-arcv2.h | 1 - arch/arm/include/asm/pgtable.h | 1 - arch/arm64/include/asm/pgtable.h | 1 - arch/csky/include/asm/pgtable.h | 1 - arch/hexagon/include/asm/pgtable.h | 1 - arch/ia64/include/asm/pgtable.h | 1 - arch/loongarch/include/asm/pgtable.h | 1 - arch/m68k/include/asm/mcf_pgtable.h | 1 - arch/m68k/include/asm/motorola_pgtable.h | 1 - arch/m68k/include/asm/sun3_pgtable.h | 1 - arch/microblaze/include/asm/pgtable.h | 1 - arch/mips/include/asm/pgtable.h | 1 - arch/nios2/include/asm/pgtable.h | 1 - arch/openrisc/include/asm/pgtable.h | 1 - arch/parisc/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/32/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/nohash/pgtable.h | 1 - arch/riscv/include/asm/pgtable.h | 1 - arch/s390/include/asm/pgtable.h | 1 - arch/sh/include/asm/pgtable_32.h | 1 - arch/sparc/include/asm/pgtable_32.h | 1 - arch/sparc/include/asm/pgtable_64.h | 1 - arch/um/include/asm/pgtable.h | 1 - arch/x86/include/asm/pgtable.h | 1 - arch/xtensa/include/asm/pgtable.h | 1 - include/linux/pgtable.h | 29 ---------------------------- mm/debug_vm_pgtable.c | 2 -- mm/memory.c | 4 ---- mm/rmap.c | 11 ----------- 31 files changed, 73 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 970abf511b13..ba43cb841d19 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -328,7 +328,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 611f412713b9..6e9f8ca6d6a1 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -132,7 +132,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 886c275995a2..2e626e6da9a3 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -298,7 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) __pte((swp).val) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 65e78999c75d..575c63de894f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -417,7 +417,6 @@ static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 574c97b9ecca..d4042495febc 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,7 +200,6 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 7eb008e477c8..59393613d086 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -397,7 +397,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) (((type & 0x1f) << 1) | \ ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index e4b8ab931399..21c97e31a28a 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -424,7 +424,6 @@ extern void paging_init (void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index c6b8fe7ac43c..d28fb9dbec59 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -276,7 +276,6 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index e573d7b649f7..13741c1245e1 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -275,7 +275,6 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index c1782563e793..ec0dc19ab834 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -190,7 +190,6 @@ extern pgd_t kernel_pg_dir[128]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index dbfc9703b15d..e582b0484a55 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -174,7 +174,6 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 7e3de54bf426..d1b8272abcd9 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -412,7 +412,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 711874cee8e4..791389bf3c12 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -528,7 +528,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) static inline int pte_swp_exclusive(pte_t pte) { diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 05999da01731..0f5c2564e9f5 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -253,7 +253,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 903b32d662ab..3eb9b9555d0d 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -408,7 +408,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 3033bb88df34..e2950f5db7c9 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -422,7 +422,6 @@ extern void paging_init (void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 0ecb3a58f23f..7bf1fe7297c6 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -386,7 +386,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb4c67bf45d7..4acc9690f599 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -717,7 +717,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 5f4620940c2c..a6caaaab6f92 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -151,7 +151,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 03a4728db039..5b9f409a940d 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -752,7 +752,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b26cbf1c533c..2b5db99e31dd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -812,7 +812,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index c34aa795a9d2..21952b094650 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -479,7 +479,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) /* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */ #define _PAGE_SWP_EXCLUSIVE _PAGE_USER -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte.pte_low & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index abf7a2601209..d4330e3c57a6 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -353,7 +353,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a1658eebd036..2dc8d4641734 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -989,7 +989,6 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index cedc5fd451ce..a70d1618eb35 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -313,7 +313,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d25195726b78..7425f32e5293 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1299,7 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 1025e2dc292b..fc7a14884c6c 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -360,7 +360,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1159b25b0542..5fd45454c073 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1064,35 +1064,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif -/* - * When replacing an anonymous page by a real (!non) swap entry, we clear - * PG_anon_exclusive from the page and instead remember whether the flag was - * set in the swp pte. During fork(), we have to mark the entry as !exclusive - * (possibly shared). On swapin, we use that information to restore - * PG_anon_exclusive, which is very helpful in cases where we might have - * additional (e.g., FOLL_GET) references on a page and wouldn't be able to - * detect exclusivity. - * - * These functions don't apply to non-swap entries (e.g., migration, hwpoison, - * ...). - */ -#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE -static inline pte_t pte_swp_mkexclusive(pte_t pte) -{ - return pte; -} - -static inline int pte_swp_exclusive(pte_t pte) -{ - return false; -} - -static inline pte_t pte_swp_clear_exclusive(pte_t pte) -{ - return pte; -} -#endif - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ff8d6f6af896..af59cc7bd307 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -810,7 +810,6 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { -#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE unsigned long max_swap_offset; swp_entry_t entry, entry2; pte_t pte; @@ -841,7 +840,6 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) WARN_ON(!is_swap_pte(pte)); entry2 = pte_to_swp_entry(pte); WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); -#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ } static void __init pte_swap_tests(struct pgtable_debug_args *args) diff --git a/mm/memory.c b/mm/memory.c index c6bacd58d032..87b33b4967c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3864,10 +3864,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { - /* - * Note that pte_swp_exclusive() == false for architectures - * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. - */ exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* diff --git a/mm/rmap.c b/mm/rmap.c index 073999f78adf..0d07c500fc86 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1710,17 +1710,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } - /* - * Note: We *don't* remember if the page was mapped - * exclusively in the swap pte if the architecture - * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In - * that case, swapin code has to re-determine that - * manually and might detect the page as possibly - * shared, for example, if there are other references on - * the page or if the page is under writeback. We made - * sure that there are no GUP pins on the page that - * would rely on it, so for GUP pins this is fine. - */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) -- cgit v1.2.3 From 6189eb82f0aec8a877190bf52e629c687ed02773 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 13 Jan 2023 15:42:53 +0000 Subject: mm/page_ext: do not allocate space for page_ext->flags if not needed There is 8 byte page_ext->flags field allocated per page whenever CONFIG_PAGE_EXTENSION is enabled. However, not every user of page_ext uses flags. Therefore, check whether flags is needed at least by one user and if so allocate space for it. For example when page_table_check is enabled, on a machine with 128G of memory before the fix: [ 2.244288] allocated 536870912 bytes of page_ext after the fix: [ 2.160154] allocated 268435456 bytes of page_ext Also, add a kernel-doc comment before page_ext_operations that describes the fields, and remove check if need() is set, as that is now a required field. [pasha.tatashin@soleen.com: address comments from Mike Rapoport] Link: https://lkml.kernel.org/r/20230117202103.1412449-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20230113154253.92480-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Mike Rapoport (IBM) Cc: Charan Teja Kalla Cc: Li Zhe Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 18 ++++++++++++++++++ mm/page_ext.c | 14 ++++++++++++-- mm/page_owner.c | 1 + mm/page_table_check.c | 1 + 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 22be4582faae..67314f648aeb 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,15 +7,33 @@ #include struct pglist_data; + +/** + * struct page_ext_operations - per page_ext client operations + * @offset: Offset to the client's data within page_ext. Offset is returned to + * the client by page_ext_init. + * @size: The size of the client data within page_ext. + * @need: Function that returns true if client requires page_ext. + * @init: (optional) Called to initialize client once page_exts are allocated. + * @need_shared_flags: True when client is using shared page_ext->flags + * field. + * + * Each Page Extension client must define page_ext_operations in + * page_ext_ops array. + */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); + bool need_shared_flags; }; #ifdef CONFIG_PAGE_EXTENSION +/* + * The page_ext_flags users must set need_shared_flags to true. + */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, diff --git a/mm/page_ext.c b/mm/page_ext.c index 4ee522fd381c..e2c22ffdbb81 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -71,6 +71,7 @@ static bool need_page_idle(void) } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, + .need_shared_flags = true, }; #endif @@ -86,7 +87,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #endif }; -unsigned long page_ext_size = sizeof(struct page_ext); +unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); @@ -106,7 +107,16 @@ static bool __init invoke_need_callbacks(void) bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need_shared_flags) { + page_ext_size = sizeof(struct page_ext); + break; + } + } + } + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d27f532df4c..f0553bedb39d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -99,6 +99,7 @@ struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, + .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 93e633c1d587..25d8610c0042 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -45,6 +45,7 @@ struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, + .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) -- cgit v1.2.3 From 524c48072e5673f4511f1ad81493e2485863fd65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:12 +0000 Subject: mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE Patch series "Discard __GFP_ATOMIC", v3. Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm: discard __GFP_ATOMIC") for a long time and recently brought up again. Most recently, I was worried that __GFP_HIGH allocations could use high-order atomic reserves which is unintentional but there was no response so lets revisit -- this series reworks how min reserves are used, protects highorder reserves and then finishes with Neil's patch with very minor modifications so it fits on top. There was a review discussion on renaming __GFP_DIRECT_RECLAIM to __GFP_ALLOW_BLOCKING but I didn't think it was that big an issue and is orthogonal to the removal of __GFP_ATOMIC. There were some concerns about how the gfp flags affect the min reserves but it never reached a solid conclusion so I made my own attempt. The series tries to iron out some of the details on how reserves are used. ALLOC_HIGH becomes ALLOC_MIN_RESERVE and ALLOC_HARDER becomes ALLOC_NON_BLOCK and documents how the reserves are affected. For example, ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min reserve. ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50% and both combined allows deeper access again. ALLOC_OOM allows access to 75%. High-order atomic allocations are explicitly handled with the caveat that no __GFP_ATOMIC flag means that any high-order allocation that specifies GFP_HIGH and cannot enter direct reclaim will be treated as if it was GFP_ATOMIC. This patch (of 6): __GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it means. As ALLOC_HIGH is internal to the allocator, rename it to ALLOC_MIN_RESERVE to document that the min reserves can be depleted. Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 4 +++- mm/page_alloc.c | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 973b48e8b1af..99eb544fbded 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -779,7 +779,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #endif #define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83be3b571fd0..4c1f1d487c3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3994,7 +3994,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_HIGH) + if (alloc_flags & ALLOC_MIN_RESERVE) min -= min / 2; if (unlikely(alloc_harder)) { @@ -4836,18 +4836,18 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* - * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD * to save two branches. */ - BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); -- cgit v1.2.3 From c988dcbecf3fd5430921eaa3fe9054754f76d185 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:13 +0000 Subject: mm/page_alloc: treat RT tasks similar to __GFP_HIGH RT tasks are allowed to dip below the min reserve but ALLOC_HARDER is typically combined with ALLOC_MIN_RESERVE so RT tasks are a little unusual. While there is some justification for allowing RT tasks access to memory reserves, there is a strong chance that a RT task that is also under memory pressure is at risk of missing deadlines anyway. Relax how much reserves an RT task can access by treating it the same as __GFP_HIGH allocations. Note that in a future kernel release that the RT special casing will be removed. Hard realtime tasks should be locking down resources in advance and ensuring enough memory is available. Even a soft-realtime task like audio or video live decoding which cannot jitter should be allocating both memory and any disk space required up-front before the recording starts instead of relying on reserves. At best, reserve access will only delay the problem by a very short interval. Link: https://lkml.kernel.org/r/20230113111217.14134-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c1f1d487c3e..4f5c2e83cb7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4865,7 +4865,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_MIN_RESERVE; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); -- cgit v1.2.3 From eb2e2b425c6984ca8034448a3f2c680622bd3d4d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:14 +0000 Subject: mm/page_alloc: explicitly record high-order atomic allocations in alloc_flags A high-order ALLOC_HARDER allocation is assumed to be atomic. While that is accurate, it changes later in the series. In preparation, explicitly record high-order atomic allocations in gfp_to_alloc_flags(). Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/page_alloc.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 99eb544fbded..62428467d7cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -789,6 +789,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ enum ttu_flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5c2e83cb7b..8a7e1cfa8353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3724,10 +3724,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) + if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* + * If the allocation fails, allow OOM handling access + * to HIGHATOMIC reserves as failing now is worse than + * failing a high-order atomic allocation in the + * future. + */ + if (!page && (alloc_flags & ALLOC_OOM)) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { spin_unlock_irqrestore(&zone->lock, flags); return NULL; @@ -4041,8 +4051,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) + if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && + !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; + } } return false; } @@ -4304,7 +4316,7 @@ try_this_zone: * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) reserve_highatomic_pageblock(page, zone, order); return page; @@ -4831,7 +4843,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, } static inline unsigned int -gfp_to_alloc_flags(gfp_t gfp_mask) +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; @@ -4857,8 +4869,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_HARDER; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + /* * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the * comment for __cpuset_node_allowed(). @@ -5066,7 +5083,7 @@ restart: * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + alloc_flags = gfp_to_alloc_flags(gfp_mask, order); /* * We need to recalculate the starting point for the zonelist iterator -- cgit v1.2.3 From ab3508854353793cd35e348fde89a5c09b2fd8b5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:15 +0000 Subject: mm/page_alloc: explicitly define what alloc flags deplete min reserves As there are more ALLOC_ flags that affect reserves, define what flags affect reserves and clarify the effect of each flag. Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 62428467d7cf..f7c3bc80c475 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -792,6 +792,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. */ +#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a7e1cfa8353..ffe2151d11ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3967,15 +3967,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); long unusable_free = (1 << order) - 1; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. + * If the caller does not have rights to reserves below the min + * watermark then subtract the high-atomic reserves. This will + * over-estimate the size of the atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!(alloc_flags & ALLOC_RESERVES))) unusable_free += z->nr_reserved_highatomic; #ifdef CONFIG_CMA @@ -3999,25 +3998,36 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_MIN_RESERVE) - min -= min / 2; + if (unlikely(alloc_flags & ALLOC_RESERVES)) { + /* + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + min -= min / 2; - if (unlikely(alloc_harder)) { /* - * OOM victims can try even harder than normal ALLOC_HARDER + * Non-blocking allocations can access some of the reserve + * with more access if also __GFP_HIGH. The reasoning is that + * a non-blocking caller may incur a more severe penalty + * if it cannot get memory quickly, particularly if it's + * also __GFP_HIGH. + */ + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + /* + * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; - else - min -= min / 4; } /* -- cgit v1.2.3 From 1ebbb21811b76c3b932959787f37985af36f62fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:16 +0000 Subject: mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves GFP_ATOMIC allocations get flagged ALLOC_HARDER which is a vague description. In preparation for the removal of GFP_ATOMIC redefine __GFP_ATOMIC to simply mean non-blocking and renaming ALLOC_HARDER to ALLOC_NON_BLOCK accordingly. __GFP_HIGH is required for access to reserves but non-blocking is granted more access. For example, GFP_NOWAIT is non-blocking but has no special access to reserves. A __GFP_NOFAIL blocking allocation is granted access similar to __GFP_HIGH if the only alternative is an OOM kill. Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++-- mm/page_alloc.c | 44 ++++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f7c3bc80c475..b0b88a95347f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -778,7 +778,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ @@ -793,7 +796,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ -#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffe2151d11ee..18ca33a1945d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4007,18 +4007,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * __GFP_HIGH allows access to 50% of the min reserve as well * as OOM. */ - if (alloc_flags & ALLOC_MIN_RESERVE) + if (alloc_flags & ALLOC_MIN_RESERVE) { min -= min / 2; - /* - * Non-blocking allocations can access some of the reserve - * with more access if also __GFP_HIGH. The reasoning is that - * a non-blocking caller may incur a more severe penalty - * if it cannot get memory quickly, particularly if it's - * also __GFP_HIGH. - */ - if (alloc_flags & ALLOC_HARDER) - min -= min / 4; + /* + * Non-blocking allocations (e.g. GFP_ATOMIC) can + * access more reserves than just __GFP_HIGH. Other + * non-blocking allocations requests such as GFP_NOWAIT + * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get + * access to the min reserve. + */ + if (alloc_flags & ALLOC_NON_BLOCK) + min -= min / 4; + } /* * OOM victims can try even harder than the normal reserve @@ -4869,28 +4870,30 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). + * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_NON_BLOCK; if (order > 0) alloc_flags |= ALLOC_HIGHATOMIC; } /* - * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed(). + * Ignore cpuset mems for non-blocking __GFP_HIGH (probably + * GFP_ATOMIC) rather than fail, see the comment for + * __cpuset_node_allowed(). */ - alloc_flags &= ~ALLOC_CPUSET; + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_MIN_RESERVE; @@ -5321,12 +5324,13 @@ nopage: WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* - * Help non-failing allocations by giving them access to memory - * reserves but do not use ALLOC_NO_WATERMARKS because this + * Help non-failing allocations by giving some access to memory + * reserves normally used for high priority non-blocking + * allocations but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make - * the situation worse + * the situation worse. */ - page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); if (page) goto got_pg; -- cgit v1.2.3 From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jan 2023 11:12:17 +0000 Subject: mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - allows __GFP_HIGH allocations to ignore watermark boosting as well as GFP_ATOMIC requests. - makes other adjustments as suggested by the above. The net result is not change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocation are more likely to succeed quickly. [mgorman: Minor adjustments to rework on top of a series] Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net Signed-off-by: NeilBrown Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Thierry Reding Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp_types.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 13 +++---------- tools/perf/builtin-kmem.c | 1 - 8 files changed, 15 insertions(+), 28 deletions(-) diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e173..e38e9d83c1c7 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 5b1af40221ec..af8d0e685260 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e1..5088637fe5c2 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c..9db52bc4ce19 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index d34dc636b81c..46b4e6c414a3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -674,17 +674,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index b0b88a95347f..2d09a7a0600a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18ca33a1945d..0cfad30fb44c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4105,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -5076,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 8ae0a1535293..f3029742b800 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -653,7 +653,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, -- cgit v1.2.3 From 2cf1338454a8a9a0b3c1271ccb521afa2d6ae241 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 13 Jan 2023 11:30:11 +0900 Subject: mm: fix khugepaged with shmem_enabled=advise Pass vm_flags as a parameter to shmem_is_huge, rather than reading the flags from the vm_area_struct in question. This allows the updated flags from hugepage_madvise to be passed to the check, which is necessary because madvise does not update the vm_area_struct's flags until after hugepage_madvise returns. This fixes an issue when shmem_enabled=madvise, where MADV_HUGEPAGE on shmem was not able to register the mm_struct with khugepaged. Prior to cd89fb065099, the mm_struct was registered by MADV_HUGEPAGE regardless of the value of shmem_enabled (which was only checked when scanning vmas). Link: https://lkml.kernel.org/r/20230113023011.1784015-1-stevensd@google.com Fixes: cd89fb065099 ("mm,thp,shmem: make khugepaged obey tmpfs mount flags") Signed-off-by: David Stevens Cc: David Stevens Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++-------- mm/huge_memory.c | 3 ++- mm/shmem.c | 18 +++++++++--------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d500ea967dc7..d09d54be4ffd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,14 +92,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma, - bool shmem_huge_force) -{ - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, - shmem_huge_force); -} +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 868fcccdff72..1d6977dc6b31 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma, !enforce_sysfs); + return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, + !enforce_sysfs, vma->vm_mm, vm_flags); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && diff --git a/mm/shmem.c b/mm/shmem.c index c5048c6c83dd..9e1015cbad29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -468,15 +468,14 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; @@ -493,7 +492,7 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, return true; fallthrough; case SHMEM_HUGE_ADVISE: - if (vma && (vma->vm_flags & VM_HUGEPAGE)) + if (mm && (vm_flags & VM_HUGEPAGE)) return true; fallthrough; default: @@ -676,8 +675,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { return false; } @@ -1068,7 +1067,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0, false)) + if (shmem_is_huge(inode, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1926,7 +1925,8 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index, false)) + if (!shmem_is_huge(inode, index, false, + vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); -- cgit v1.2.3 From ee7a5906ff08e435ed95ec9fe7c7eed2c11015d2 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:26 -0800 Subject: pagemap: add filemap_grab_folio() Patch series "Convert to filemap_get_folios_tag()", v5. This patch series replaces find_get_pages_range_tag() with filemap_get_folios_tag(). This also allows the removal of multiple calls to compound_head() throughout. It also makes a good chunk of the straightforward conversions to folios, and takes the opportunity to introduce a function that grabs a folio from the pagecache. This patch (of 23): Add function filemap_grab_folio() to grab a folio from the page cache. This function is meant to serve as a folio replacement for grab_cache_page, and is used to facilitate the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230104211448.4804-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 29e1f9e76eb6..468183be67be 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -546,6 +546,26 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } +/** + * filemap_grab_folio - grab a folio from the page cache + * @mapping: The address space to search + * @index: The page index + * + * Looks up the page cache entry at @mapping & @index. If no folio is found, + * a new folio is created. The folio is locked, marked as accessed, and + * returned. + * + * Return: A found or created folio. NULL if no folio is found and failed to + * create a folio. + */ +static inline struct folio *filemap_grab_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping)); +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search -- cgit v1.2.3 From 247f9e1feef4e57911510c8f82348efb4491ea0e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:27 -0800 Subject: filemap: add filemap_get_folios_tag() This is the equivalent of find_get_pages_range_tag(), except for folios instead of pages. One noteable difference is filemap_get_folios_tag() does not take in a maximum pages argument. It instead tries to fill a folio batch and stops either once full (15 folios) or reaching the end of the search range. The new function supports large folios, the initial function did not since all callers don't use large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 468183be67be..bb3c1d51b1cb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -739,6 +739,8 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc70..291bb3e0957a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2281,6 +2281,60 @@ out: } EXPORT_SYMBOL(filemap_get_folios_contig); +/** + * filemap_get_folios_tag - Get a batch of folios matching @tag + * @mapping: The address_space to search + * @start: The starting page index + * @end: The final page index (inclusive) + * @tag: The tag index + * @fbatch: The batch to fill + * + * Same as filemap_get_folios(), but only returning folios tagged with @tag. + * + * Return: The number of folios found. + * Also update @start to index the next folio for traversal. + */ +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, tag)) != NULL) { + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(folio)) + continue; + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a page at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} +EXPORT_SYMBOL(filemap_get_folios_tag); + /** * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search -- cgit v1.2.3 From 6817ef514e1aacee228f6a9fbcdc3a2c49cb6c29 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:28 -0800 Subject: filemap: convert __filemap_fdatawait_range() to use filemap_get_folios_tag() Convert function to use folios. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 2 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 291bb3e0957a..85adbcf2d9a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -503,25 +503,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned nr_folios; + + folio_batch_init(&fbatch); - pagevec_init(&pvec); while (index <= end) { unsigned i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK); - if (!nr_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + PAGECACHE_TAG_WRITEBACK, &fbatch); + + if (!nr_folios) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - wait_on_page_writeback(page); - ClearPageError(page); + folio_wait_writeback(folio); + folio_clear_error(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } -- cgit v1.2.3 From 0fff435f060c8b29cb068d4068cb2df513046865 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:29 -0800 Subject: page-writeback: convert write_cache_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 8 calls to compound_head(), and the function now supports large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e91f94b3438b..5e892f20bed7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2398,15 +2398,15 @@ int write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int error; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; @@ -2426,17 +2426,18 @@ int write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index; + done_index = folio->index; - lock_page(page); + folio_lock(folio); /* * Page truncated or invalidated. We can freely skip it @@ -2446,30 +2447,30 @@ int write_cache_pages(struct address_space *mapping, * even if there is now a new, dirty page at the same * pagecache address. */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = (*writepage)(page, wbc, data); + error = writepage(&folio->page, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of @@ -2484,11 +2485,12 @@ continue_unlock: * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); error = 0; } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } @@ -2508,7 +2510,7 @@ continue_unlock: break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } -- cgit v1.2.3 From acc8d8588cb7e3e64b0d2fa611dad06574cd67b1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:30 -0800 Subject: afs: convert afs_writepages_region() to use filemap_get_folios_tag() Convert to use folios throughout. This function is in preparation to remove find_get_pages_range_tag(). Also modify this function to write the whole batch one at a time, rather than calling for a new set every single write. Link: https://lkml.kernel.org/r/20230104211448.4804-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Tested-by: David Howells Signed-off-by: Andrew Morton --- fs/afs/write.c | 116 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 57 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323..2d3b08b7406c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -704,85 +704,87 @@ static int afs_writepages_region(struct address_space *mapping, bool max_one_loop) { struct folio *folio; - struct page *head_page; + struct folio_batch fbatch; ssize_t ret; + unsigned int i; int n, skips = 0; _enter("%llx,%llx,", start, end); + folio_batch_init(&fbatch); do { pgoff_t index = start / PAGE_SIZE; - n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &head_page); + n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!n) break; + for (i = 0; i < n; i++) { + folio = fbatch.folios[i]; + start = folio_pos(folio); /* May regress with THPs */ - folio = page_folio(head_page); - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio_index(folio)); - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_put(folio); - return ret; - } - } else { - if (!folio_trylock(folio)) { - folio_put(folio); - return 0; + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_batch_release(&fbatch); + return ret; + } + } else { + if (!folio_trylock(folio)) + continue; } - } - if (folio_mapping(folio) != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - folio_put(folio); - continue; - } + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + continue; + } - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); #ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); + folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + } else { + start += folio_size(folio); + } + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + *_next = start; + _leave(" = 0 [%llx]", *_next); + return 0; + } + skips++; + } + continue; } - folio_put(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) - break; - skips++; + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + ret = afs_write_back_from_locked_folio(mapping, wbc, + folio, start, end); + if (ret < 0) { + _leave(" = %zd", ret); + folio_batch_release(&fbatch); + return ret; } - continue; - } - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); - folio_put(folio); - if (ret < 0) { - _leave(" = %zd", ret); - return ret; + start += ret; } - start += ret; - - if (max_one_loop) - break; - + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); -- cgit v1.2.3 From 51c5cd3bafe5e1e8a678d661c43b09d7c6584274 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:31 -0800 Subject: btrfs: convert btree_write_cache_pages() to use filemap_get_folio_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..d5ef288d3a43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2865,14 +2865,14 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2895,14 +2895,15 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, + &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2917,7 +2918,7 @@ retry: */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { -- cgit v1.2.3 From 9f50fd2e92e37441da3a1daa8e27fd0c400b6cdd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:32 -0800 Subject: btrfs: convert extent_write_cache_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). Now also supports large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d5ef288d3a43..0a2d6fb611c6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2993,8 +2993,8 @@ static int extent_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -3014,7 +3014,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3052,14 +3052,14 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, - &index, end, tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, + end, tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index + 1; + done_index = folio->index + folio_nr_pages(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -3067,29 +3067,29 @@ retry: * or even swizzled back from swapper_space to * tmpfs file mapping */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); - lock_page(page); + folio_lock(folio); } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) + if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); - wait_on_page_writeback(page); + folio_wait_writeback(folio); } - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); + if (folio_test_writeback(folio) || + !folio_clear_dirty_for_io(folio)) { + folio_unlock(folio); continue; } - ret = __extent_writepage(page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3102,7 +3102,7 @@ retry: */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { -- cgit v1.2.3 From 590a2b5f0a9b740e415e0d52bd8a0f87fc15b87b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:33 -0800 Subject: ceph: convert ceph_writepages_start() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Also some minor renaming for consistency. Link: https://lkml.kernel.org/r/20230104211448.4804-9-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Jeff Layton Signed-off-by: Andrew Morton --- fs/ceph/addr.c | 58 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..905268bf9741 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -792,7 +792,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct pagevec pvec; + struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; @@ -821,7 +821,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec); + folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -869,7 +869,7 @@ retry: while (!done && index <= end) { int num_ops = 0, op_idx; - unsigned i, pvec_pages, max_pages, locked_pages = 0; + unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; @@ -879,13 +879,13 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY); - dout("pagevec_lookup_range_tag got %d\n", pvec_pages); - if (!pvec_pages && !locked_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, + end, PAGECACHE_TAG_DIRTY, &fbatch); + dout("pagevec_lookup_range_tag got %d\n", nr_folios); + if (!nr_folios && !locked_pages) break; - for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { - page = pvec.pages[i]; + for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { + page = &fbatch.folios[i]->page; dout("? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -995,7 +995,7 @@ get_more_pages: len = 0; } - /* note position of first page in pvec */ + /* note position of first page in fbatch */ dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -1005,30 +1005,30 @@ get_more_pages: fsc->write_congested = true; pages[locked_pages++] = page; - pvec.pages[i] = NULL; + fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? */ if (!locked_pages) - goto release_pvec_pages; + goto release_folios; if (i) { unsigned j, n = 0; - /* shift unused page to beginning of pvec */ - for (j = 0; j < pvec_pages; j++) { - if (!pvec.pages[j]) + /* shift unused page to beginning of fbatch */ + for (j = 0; j < nr_folios; j++) { + if (!fbatch.folios[j]) continue; if (n < j) - pvec.pages[n] = pvec.pages[j]; + fbatch.folios[n] = fbatch.folios[j]; n++; } - pvec.nr = n; + fbatch.nr = n; - if (pvec_pages && i == pvec_pages && + if (nr_folios && i == nr_folios && locked_pages < max_pages) { - dout("reached end pvec, trying for more\n"); - pagevec_release(&pvec); + dout("reached end fbatch, trying for more\n"); + folio_batch_release(&fbatch); goto get_more_pages; } } @@ -1164,10 +1164,10 @@ new_request: if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; -release_pvec_pages: - dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, - pvec.nr ? pvec.pages[0] : NULL); - pagevec_release(&pvec); +release_folios: + dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr, + fbatch.nr ? fbatch.folios[0] : NULL); + folio_batch_release(&fbatch); } if (should_loop && !done) { @@ -1184,15 +1184,17 @@ release_pvec_pages: unsigned i, nr; index = 0; while ((index <= end) && - (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK))) { + (nr = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &fbatch))) { for (i = 0; i < nr; i++) { - page = pvec.pages[i]; + page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } -- cgit v1.2.3 From 4cda80f3a7a53a0bc66cd9f16f7872524cfdd87d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:34 -0800 Subject: cifs: convert wdata_alloc_and_fillpages() to use filemap_get_folios_tag() This is in preparation for the removal of find_get_pages_range_tag(). Now also supports the use of large folios. Since tofind might be larger than the max number of folios in a folio_batch (15), we loop through filling in wdata->pages pulling more batches until we either reach tofind pages or run out of folios. This function may not return all pages in the last found folio before tofind pages are reached. Link: https://lkml.kernel.org/r/20230104211448.4804-10-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Paulo Alcantara (SUSE) Cc: Tom Talpey Signed-off-by: Andrew Morton --- fs/cifs/file.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..8cdd2f67af24 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2527,14 +2527,40 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, unsigned int *found_pages) { struct cifs_writedata *wdata; - + struct folio_batch fbatch; + unsigned int i, idx, p, nr; wdata = cifs_writedata_alloc((unsigned int)tofind, cifs_writev_complete); if (!wdata) return NULL; - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); + folio_batch_init(&fbatch); + *found_pages = 0; + +again: + nr = filemap_get_folios_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!nr) + goto out; /* No dirty pages left in the range */ + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + wdata->pages[*found_pages] = folio_page(folio, idx); + folio_get(folio); + if (++*found_pages == tofind) { + folio_batch_release(&fbatch); + goto out; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +out: return wdata; } -- cgit v1.2.3 From 50ead2537441f7df8d493e1085da76034ea92cf1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:35 -0800 Subject: ext4: convert mpage_prepare_extent_to_map() to use filemap_get_folios_tag() Convert the function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). Now supports large folios. This change removes 11 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-11-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 65 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d9f414f99fe..fb6cd994e59a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2595,8 +2595,8 @@ static bool ext4_page_nomap_can_writeout(struct page *page) static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec pvec; - unsigned int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; @@ -2610,18 +2610,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - - pagevec_init(&pvec); + folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply @@ -2635,10 +2634,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) + if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; - lock_page(page); + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which @@ -2646,16 +2645,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ - if (!PageDirty(page) || - (PageWriteback(page) && + if (!folio_test_dirty(folio) || + (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || - unlikely(page->mapping != mapping)) { - unlock_page(page); + unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); - BUG_ON(PageWriteback(page)); + folio_wait_writeback(folio); + BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in @@ -2666,49 +2665,49 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ - if (!page_has_buffers(page)) { - ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); + if (!folio_buffers(folio)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); + folio_clear_dirty(folio); + folio_unlock(folio); continue; } if (mpd->map.m_len == 0) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; + mpd->first_page = folio->index; + mpd->next_page = folio->index + folio_nr_pages(folio); /* * Writeout for transaction commit where we cannot * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); + if (ext4_page_nomap_can_writeout(&folio->page)) { + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } else { - unlock_page(page); - mpd->first_page++; + folio_unlock(folio); + mpd->first_page += folio_nr_pages(folio); } } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << + lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); - head = page_buffers(page); + head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, - lblk); + lblk); if (err <= 0) goto out; err = 0; } - left--; + left -= folio_nr_pages(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } -- cgit v1.2.3 From e6e46e1eb7cea179b9b31a62a0bbac6ba24bd050 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:36 -0800 Subject: f2fs: convert f2fs_fsync_node_pages() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-12-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dde4c0458704..3e0362794e27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1731,12 +1731,12 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nr_pages; + int nr_folios; int nwritten = 0; if (atomic) { @@ -1745,20 +1745,21 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); ret = -EIO; goto out; } @@ -1824,7 +1825,7 @@ continue_unlock: break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (ret || marked) -- cgit v1.2.3 From a40a4ad1186a37671070786b8143b16377899b5d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:37 -0800 Subject: f2fs: convert f2fs_flush_inline_data() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-13-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3e0362794e27..1c5dc7a3207e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1890,17 +1890,18 @@ static bool flush_dirty_inode(struct page *page) void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) { pgoff_t index = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while ((nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (!IS_DNODE(page)) continue; @@ -1927,7 +1928,7 @@ continue_unlock: } unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } -- cgit v1.2.3 From 7525486affa518c9e7ffc9b9dbc966021041ebde Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:38 -0800 Subject: f2fs: convert f2fs_sync_node_pages() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-14-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1c5dc7a3207e..51e9f286f53a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1938,23 +1938,24 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, bool do_balance, enum iostat_type io_type) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int step = 0; int nwritten = 0; int ret = 0; - int nr_pages, done = 0; + int nr_folios, done = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); next_step: index = 0; - while (!done && (nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), + &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2029,7 +2030,7 @@ write_node: if (--wbc->nr_to_write == 0) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (wbc->nr_to_write == 0) { -- cgit v1.2.3 From 1cd98ee747cff120ee9b93988ddb7315d8d8f8e7 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:39 -0800 Subject: f2fs: convert f2fs_write_cache_pages() to use filemap_get_folios_tag() Convert the function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Also modified f2fs_all_cluster_page_ready to take in a folio_batch instead of pagevec. This does NOT support large folios. The function currently only utilizes folios of size 1 so this shouldn't cause any issues right now. This version of the patch limits the number of pages fetched to F2FS_ONSTACK_PAGES. If that ever happens, update the start index here since filemap_get_folios_tag() updates the index to be after the last found folio, not necessarily the last used page. Link: https://lkml.kernel.org/r/20230104211448.4804-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/data.c | 84 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 97e816590cd9..b02c5b384204 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2957,6 +2957,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0, retry = 0; struct page *pages[F2FS_ONSTACK_PAGES]; + struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2977,6 +2978,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .private = NULL, }; #endif + int nr_folios, p, idx; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ @@ -2987,6 +2989,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; + folio_batch_init(&fbatch); + if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -3012,13 +3016,38 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = find_get_pages_range_tag(mapping, &index, end, - tag, F2FS_ONSTACK_PAGES, pages); - if (nr_pages == 0) + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; break; + } + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == F2FS_ONSTACK_PAGES) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; @@ -3035,7 +3064,7 @@ readd: } if (!f2fs_cluster_can_merge_page(&cc, - page->index)) { + folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) @@ -3044,27 +3073,28 @@ readd: } if (unlikely(f2fs_cp_error(sbi))) - goto lock_page; + goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) - goto lock_page; + goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) - goto lock_page; + goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, - page->index, &fsdata); + folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, - fsdata, page->index, 1) || + fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, - pages, i, nr_pages, false))) { + pages, i, nr_pages, + false))) { retry = 1; break; } @@ -3077,46 +3107,47 @@ readd: break; } #ifdef CONFIG_F2FS_FS_COMPRESSION -lock_page: +lock_folio: #endif - done_index = page->index; + done_index = folio->index; retry_write: - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, + f2fs_wait_on_page_writeback( + &folio->page, DATA, true, true); else goto continue_unlock; } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - get_page(page); - f2fs_compress_ctx_add_page(&cc, page); + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, &folio->page); continue; } #endif - ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, - 0, true); + ret = f2fs_write_single_data_page(&folio->page, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif @@ -3140,7 +3171,8 @@ result: } goto next; } - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } -- cgit v1.2.3 From 4f4a4f0febe6009a4cdc8acac52cc5dc980f185c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:40 -0800 Subject: f2fs: convert last_fsync_dnode() to use filemap_get_folios_tag() Convert to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-16-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 51e9f286f53a..cf997356d9f9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1518,23 +1518,24 @@ iput_out: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; struct page *last_page = NULL; - int nr_pages; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); return ERR_PTR(-EIO); } @@ -1565,7 +1566,7 @@ continue_unlock: last_page = page; unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } return last_page; -- cgit v1.2.3 From 580e7a4926089ea735fa09d42030d90e21537f7f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:41 -0800 Subject: f2fs: convert f2fs_sync_meta_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 5 calls to compound_head(). Initially the function was checking if the previous page index is truly the previous page i.e. 1 index behind the current page. To convert to folios and maintain this check we need to make the check folio->index != prev + folio_nr_pages(previous folio) since we don't know how many pages are in a folio. At index i == 0 the check is guaranteed to succeed, so to workaround indexing bounds we can simply ignore the check for that specific index. This makes the initial assignment of prev trivial, so I removed that as well. Also modify a comment in commit_checkpoint for consistency. Link: https://lkml.kernel.org/r/20230104211448.4804-17-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/checkpoint.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 56f7d0d6a8b2..5a5515d83a1b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -395,59 +395,62 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; - struct pagevec pvec; + struct folio_batch fbatch; long nwritten = 0; - int nr_pages; + int nr_folios; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec); + folio_batch_init(&fbatch); blk_start_plug(&plug); - while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - if (prev == ULONG_MAX) - prev = page->index - 1; - if (nr_to_write != LONG_MAX && page->index != prev + 1) { - pagevec_release(&pvec); + if (nr_to_write != LONG_MAX && i != 0 && + folio->index != prev + + folio_nr_pages(fbatch.folios[i-1])) { + folio_batch_release(&fbatch); goto stop; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true, true); + f2fs_wait_on_page_writeback(&folio->page, META, + true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - if (__f2fs_write_meta_page(page, &wbc, io_type)) { - unlock_page(page); + if (__f2fs_write_meta_page(&folio->page, &wbc, + io_type)) { + folio_unlock(folio); break; } - nwritten++; - prev = page->index; + nwritten += folio_nr_pages(folio); + prev = folio->index; if (unlikely(nwritten >= nr_to_write)) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } stop: @@ -1403,7 +1406,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, }; /* - * pagevec_lookup_tag and lock_page again will take + * filemap_get_folios_tag and lock_page again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. */ -- cgit v1.2.3 From 87ed37e66dfd08f6d692969cbd39282a359a2f7d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:42 -0800 Subject: gfs2: convert gfs2_write_cache_jdata() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pgaes_range_tag(). This change removes 8 calls to compound_head(). Also had to modify and rename gfs2_write_jdata_pagevec() to take in and utilize folio_batch rather than pagevec and use folios rather than pages. gfs2_write_jdata_batch() now supports large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-18-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Andreas Gruenbacher Signed-off-by: Andrew Morton --- fs/gfs2/aops.c | 64 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e782b4f1d104..0a47068f9acc 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -195,67 +195,71 @@ static int gfs2_writepages(struct address_space *mapping, } /** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * gfs2_write_jdata_batch - Write back a folio batch's worth of folios * @mapping: The mapping * @wbc: The writeback control - * @pvec: The vector of pages - * @nr_pages: The number of pages to write + * @fbatch: The batch of folios * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ -static int gfs2_write_jdata_pagevec(struct address_space *mapping, +static int gfs2_write_jdata_batch(struct address_space *mapping, struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, + struct folio_batch *fbatch, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + unsigned nrblocks; int i; int ret; + int nr_pages = 0; + int nr_folios = folio_batch_count(fbatch); + + for (i = 0; i < nr_folios; i++) + nr_pages += folio_nr_pages(fbatch->folios[i]); + nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch->folios[i]; - *done_index = page->index; + *done_index = folio->index; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(page, wbc); + ret = __gfs2_jdata_writepage(&folio->page, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); ret = 0; } else { @@ -268,7 +272,8 @@ continue_unlock: * not be suitable for data integrity * writeout). */ - *done_index = page->index + 1; + *done_index = folio->index + + folio_nr_pages(folio); ret = 1; break; } @@ -305,8 +310,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, { int ret = 0; int done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t writeback_index; pgoff_t index; pgoff_t end; @@ -315,7 +320,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -341,17 +346,18 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index); + ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, + &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } -- cgit v1.2.3 From 5ee4b25cb7302ae2c62fab7adc1529d9f497bc6d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:43 -0800 Subject: nilfs2: convert nilfs_lookup_dirty_data_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 4 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-19-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f7a14ed12a66..c17db9a0c665 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -680,7 +680,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; @@ -694,23 +694,26 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || - !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY)) + !filemap_get_folios_tag(mapping, &index, last, + PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; - for (i = 0; i < pagevec_count(&pvec); i++) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, i_blocksize(inode), 0); - unlock_page(page); + folio_lock(folio); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); + } + folio_unlock(folio); - bh = head = page_buffers(page); + bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; @@ -718,13 +721,13 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; } -- cgit v1.2.3 From a2458658316959a1756d06c2f64ba8a6f316f9de Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:44 -0800 Subject: nilfs2: convert nilfs_lookup_dirty_node_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 1 call to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-20-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c17db9a0c665..19446a8243d7 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -737,20 +737,19 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; + folio_batch_init(&fbatch); - pagevec_init(&pvec); - - while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { @@ -761,7 +760,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, bh = bh->b_this_page; } while (bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } -- cgit v1.2.3 From 41f3f3b5373e9b7372a0ecf4814c01f62600c124 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:45 -0800 Subject: nilfs2: convert nilfs_btree_lookup_dirty_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 1 call to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-21-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b5f997e5e670..2681a449edc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2150,7 +2150,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; @@ -2160,19 +2160,19 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, btcache, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = bh->b_this_page) != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } -- cgit v1.2.3 From d4a16d31334e0a1dd948dfb2977f241805fd0e14 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:46 -0800 Subject: nilfs2: convert nilfs_copy_dirty_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 8 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-22-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 39b7eea2642a..d921542a9593 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -240,42 +240,43 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; int err = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) + if (!filemap_get_folios_tag(smap, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) return 0; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i], *dpage; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; - lock_page(page); - if (unlikely(!PageDirty(page))) - NILFS_PAGE_BUG(page, "inconsistent dirty state"); + folio_lock(folio); + if (unlikely(!folio_test_dirty(folio))) + NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); - dpage = grab_cache_page(dmap, page->index); - if (unlikely(!dpage)) { + dfolio = filemap_grab_folio(dmap, folio->index); + if (unlikely(!dfolio)) { /* No empty page is added to the page cache */ err = -ENOMEM; - unlock_page(page); + folio_unlock(folio); break; } - if (unlikely(!page_has_buffers(page))) - NILFS_PAGE_BUG(page, + if (unlikely(!folio_buffers(folio))) + NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(dpage, page, 1); - __set_page_dirty_nobuffers(dpage); + nilfs_copy_page(&dfolio->page, &folio->page, 1); + filemap_dirty_folio(folio_mapping(dfolio), dfolio); - unlock_page(dpage); - put_page(dpage); - unlock_page(page); + folio_unlock(dfolio); + folio_put(dfolio); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (likely(!err)) -- cgit v1.2.3 From 243c5ea4f783922f46779f9071b1948e1c1d0291 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:47 -0800 Subject: nilfs2: convert nilfs_clear_dirty_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 2 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-23-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d921542a9593..41ccd43cd979 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -358,22 +358,22 @@ repeat: */ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - nilfs_clear_dirty_page(page, silent); - unlock_page(page); + folio_lock(folio); + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } -- cgit v1.2.3 From c5792d9384113de4085dfbce6940e2a853debb67 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:48 -0800 Subject: filemap: remove find_get_pages_range_tag() All callers to find_get_pages_range_tag(), find_get_pages_tag(), pagevec_lookup_range_tag(), and pagevec_lookup_tag() have been removed. Link: https://lkml.kernel.org/r/20230104211448.4804-24-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 10 --------- include/linux/pagevec.h | 8 ------- mm/filemap.c | 60 ------------------------------------------------- mm/swap.c | 10 --------- 4 files changed, 88 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bb3c1d51b1cb..9f1081683771 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -741,16 +741,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages); -static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, - nr_pages, pages); -} struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2a6f61a0c10a..f582f7213ea5 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,14 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag); -static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, xa_mark_t tag) -{ - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); -} static inline void pagevec_init(struct pagevec *pvec) { diff --git a/mm/filemap.c b/mm/filemap.c index 85adbcf2d9a7..31bf18ec6d01 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2337,66 +2337,6 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); -/** - * find_get_pages_range_tag - Find and return head pages matching @tag. - * @mapping: the address_space to search - * @index: the starting page index - * @end: The final page index (inclusive) - * @tag: the tag index - * @nr_pages: the maximum number of pages - * @pages: where the resulting pages are placed - * - * Like find_get_pages_range(), except we only return head pages which are - * tagged with @tag. @index is updated to the index immediately after the - * last page we return, ready for the next iteration. - * - * Return: the number of pages which were found. - */ -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - XA_STATE(xas, &mapping->i_pages, *index); - struct folio *folio; - unsigned ret = 0; - - if (unlikely(!nr_pages)) - return 0; - - rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, tag))) { - /* - * Shadow entries should never be tagged, but this iteration - * is lockless so there is a window for page reclaim to evict - * a page we saw tagged. Skip over it. - */ - if (xa_is_value(folio)) - continue; - - pages[ret] = &folio->page; - if (++ret == nr_pages) { - *index = folio->index + folio_nr_pages(folio); - goto out; - } - } - - /* - * We come here when we got to @end. We take care to not overflow the - * index @index as it confuses some of the callers. This breaks the - * iteration when there is a page at index -1 but that is already - * broken anyway. - */ - if (end == (pgoff_t)-1) - *index = (pgoff_t)-1; - else - *index = end + 1; -out: - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(find_get_pages_range_tag); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: diff --git a/mm/swap.c b/mm/swap.c index 42d67f9baa8c..5e4f92700c16 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1115,16 +1115,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - PAGEVEC_SIZE, pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_tag); - /* * Perform any setup for the swap system */ -- cgit v1.2.3 From 6bc56a4d855303705802c5ede4625973637484c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:09 +0000 Subject: mm: add vma_alloc_zeroed_movable_folio() Replace alloc_zeroed_user_highpage_movable(). The main difference is returning a folio containing a single page instead of returning the page, but take the opportunity to rename the function to match other allocation functions a little better and rewrite the documentation to place more emphasis on the zeroing rather than the highmem aspect. Link: https://lkml.kernel.org/r/20230116191813.2145215-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 5 ++--- arch/arm64/include/asm/page.h | 4 ++-- arch/arm64/mm/fault.c | 4 ++-- arch/ia64/include/asm/page.h | 14 ++++++-------- arch/m68k/include/asm/page_no.h | 5 ++--- arch/s390/include/asm/page.h | 5 ++--- arch/x86/include/asm/page.h | 5 ++--- include/linux/highmem.h | 33 ++++++++++++++++----------------- mm/memory.c | 16 ++++++++++------ 9 files changed, 44 insertions(+), 47 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 8f3f5eecba28..bc5256fba8f0 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -17,9 +17,8 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 993a27ea6f54..2312e6ee595f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -29,9 +29,9 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio void tag_clear_highpage(struct page *to); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 596f46dabe4e..f4cb0f85ccf4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -925,7 +925,7 @@ NOKPROBE_SYMBOL(do_debug_exception); /* * Used during anonymous page fault handling. */ -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; @@ -938,7 +938,7 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return alloc_page_vma(flags, vma, vaddr); + return vma_alloc_folio(flags, 0, vma, vaddr, false); } void tag_clear_highpage(struct page *page) diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index 1b990466d540..ba0b365cf2b2 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -82,17 +82,15 @@ do { \ } while (0) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ ({ \ - struct page *page = alloc_page_vma( \ - GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr); \ - if (page) \ - flush_dcache_page(page); \ - page; \ + struct folio *folio = vma_alloc_folio( \ + GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false); \ + if (folio) \ + flush_dcache_folio(folio); \ + folio; \ }) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE - #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index c9d0d84158a4..abd2c3aeb015 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -13,9 +13,8 @@ extern unsigned long memory_end; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 61dea67bb9c7..8a2a3b5d1e29 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -73,9 +73,8 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9cc82f305f4b..d18e5c332cb9 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d7097b8158f2..e22509420ac6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -207,31 +207,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#ifndef vma_alloc_zeroed_movable_folio /** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into + * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. + * @vma: The VMA the page is to be allocated for. + * @vaddr: The virtual address the page will be inserted into. * - * Returns: The allocated and zeroed HIGHMEM page + * This function will allocate a page suitable for inserting into this + * VMA at this virtual address. It may be allocated from highmem or + * the movable zone. An architecture may provide its own implementation. * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - * - * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own - * implementation. + * Return: A folio containing one allocated and zeroed page or NULL if + * we are out of memory. */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +static inline +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + struct folio *folio; - if (page) - clear_user_highpage(page, vaddr); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + if (folio) + clear_user_highpage(&folio->page, vaddr); - return page; + return folio; } #endif diff --git a/mm/memory.c b/mm/memory.c index 87b33b4967c2..b6358ffbccaa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3056,10 +3056,12 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, - vmf->address); - if (!new_page) + struct folio *new_folio; + + new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!new_folio) goto oom; + new_page = &new_folio->page; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -3995,6 +3997,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; + struct folio *folio; vm_fault_t ret = 0; pte_t entry; @@ -4044,11 +4047,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, vmf->address); - if (!page) + folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!folio) goto oom; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + page = &folio->page; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); -- cgit v1.2.3 From cb3184deef10fdc7658fb366189864c89ad118c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:10 +0000 Subject: mm: convert do_anonymous_page() to use a folio Removes six calls to compound_head(); some inline and some external. Link: https://lkml.kernel.org/r/20230116191813.2145215-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b6358ffbccaa..950e5a4f2cf1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3996,7 +3996,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page; struct folio *folio; vm_fault_t ret = 0; pte_t entry; @@ -4051,19 +4050,18 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!folio) goto oom; - page = &folio->page; if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; - cgroup_throttle_swaprate(page, GFP_KERNEL); + cgroup_throttle_swaprate(&folio->page, GFP_KERNEL); /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); - entry = mk_pte(page, vma->vm_page_prot); + entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -4082,13 +4080,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(page); + folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_lru_vma(folio, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -4098,10 +4096,10 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - put_page(page); + folio_put(folio); goto unlock; oom_free_page: - put_page(page); + folio_put(folio); oom: return VM_FAULT_OOM; } -- cgit v1.2.3 From 28d41a4863316321bb5aa616bd82d65c84fc0f8b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:11 +0000 Subject: mm: convert wp_page_copy() to use folios Use new_folio instead of new_page throughout, because we allocated it and know it's an order-0 folio. Most old_page uses become old_folio, but use vmf->page where we need the precise page. Link: https://lkml.kernel.org/r/20230116191813.2145215-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 65 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 950e5a4f2cf1..720fbb4771de 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3043,8 +3043,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; - struct page *old_page = vmf->page; - struct page *new_page = NULL; + struct folio *old_folio = NULL; + struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; @@ -3052,23 +3052,22 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_start(); + if (vmf->page) + old_folio = page_folio(vmf->page); if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - struct folio *new_folio; - new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!new_folio) goto oom; - new_page = &new_folio->page; } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!new_folio) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); + ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (ret) { /* * COW failed, if the fault was solved by other, @@ -3077,21 +3076,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * from the second attempt. * The -EHWPOISON case will not be retried. */ - put_page(new_page); - if (old_page) - put_page(old_page); + folio_put(new_folio); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } - kmsan_copy_page_meta(new_page, old_page); + kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) + if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) goto oom_free_new; - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, @@ -3103,16 +3102,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + if (old_folio) { + if (!folio_test_anon(old_folio)) { + dec_mm_counter(mm, mm_counter_file(&old_folio->page)); inc_mm_counter(mm, MM_ANONPAGES); } } else { inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); + entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3131,8 +3130,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3141,7 +3140,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); - if (old_page) { + if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another @@ -3164,18 +3163,18 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma, false); + page_remove_rmap(vmf->page, vma, false); } /* Free the old page.. */ - new_page = old_page; + new_folio = old_folio; page_copied = 1; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); } - if (new_page) - put_page(new_page); + if (new_folio) + folio_put(new_folio); pte_unmap_unlock(vmf->pte, vmf->ptl); /* @@ -3183,19 +3182,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); - if (old_page) { + if (old_folio) { if (page_copied) - free_swap_cache(old_page); - put_page(old_page); + free_swap_cache(&old_folio->page); + folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom_free_new: - put_page(new_page); + folio_put(new_folio); oom: - if (old_page) - put_page(old_page); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return VM_FAULT_OOM; -- cgit v1.2.3 From edf5047058395c89a912783ea29ec8f9e53be414 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:12 +0000 Subject: mm: use a folio in copy_pte_range() Allocate an order-0 folio instead of a page and pass it all the way down the call chain. Removes dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20230116191813.2145215-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 720fbb4771de..20a5c8087a5a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -863,13 +863,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc, struct page *page) + struct folio **prealloc, struct page *page) { - struct page *new_page; + struct folio *new_folio; pte_t pte; - new_page = *prealloc; - if (!new_page) + new_folio = *prealloc; + if (!new_folio) return -EAGAIN; /* @@ -877,14 +877,14 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * over and copy the page & arm it. */ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, src_vma); - __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, dst_vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, dst_vma); - rss[mm_counter(new_page)]++; + copy_user_highpage(&new_folio->page, page, addr, src_vma); + __folio_mark_uptodate(new_folio); + folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_lru_vma(new_folio, dst_vma); + rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -900,7 +900,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc) + struct folio **prealloc) { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; @@ -922,11 +922,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); } - rss[mm_counter(page)]++; + rss[MM_ANONPAGES]++; } else if (page) { get_page(page); page_dup_file_rmap(page, false); - rss[mm_counter(page)]++; + rss[mm_counter_file(page)]++; } /* @@ -954,23 +954,22 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } -static inline struct page * -page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, - unsigned long addr) +static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr) { - struct page *new_page; + struct folio *new_folio; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (!new_folio) return NULL; - if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { - put_page(new_page); + if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { + folio_put(new_folio); return NULL; } - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - return new_page; + return new_folio; } static int @@ -986,7 +985,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; - struct page *prealloc = NULL; + struct folio *prealloc = NULL; again: progress = 0; @@ -1056,7 +1055,7 @@ again: * will allocate page according to address). This * could only happen if one pinned pte changed. */ - put_page(prealloc); + folio_put(prealloc); prealloc = NULL; } progress += 8; @@ -1093,7 +1092,7 @@ again: goto again; out: if (unlikely(prealloc)) - put_page(prealloc); + folio_put(prealloc); return ret; } -- cgit v1.2.3 From 14ddee4126fecff5c5c0a84940ba34f0bfe3e708 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:13 +0000 Subject: mm: use a folio in copy_present_pte() We still have to keep the page around because we need to know which page in the folio we're copying, but we can replace five implict calls to compound_head() with one. Link: https://lkml.kernel.org/r/20230116191813.2145215-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 20a5c8087a5a..029f838587d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -906,25 +906,28 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; + struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page && PageAnon(page)) { + if (page) + folio = page_folio(page); + if (page && folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - get_page(page); + folio_get(folio); if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { - /* Page maybe pinned, we have to copy. */ - put_page(page); + /* Page may be pinned, we have to copy. */ + folio_put(folio); return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); } rss[MM_ANONPAGES]++; } else if (page) { - get_page(page); + folio_get(folio); page_dup_file_rmap(page, false); rss[mm_counter_file(page)]++; } @@ -937,7 +940,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } - VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page)); + VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); /* * If it's a shared mapping, mark it clean in -- cgit v1.2.3 From 9cfb816b1c6c99f4b3c1d4a0fb096162cd17ec71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:25:06 +0000 Subject: mm/fs: convert inode_attach_wb() to take a folio Patch series "Writeback folio conversions". Remove more calls to compound_head() by passing folios around instead of pages. This patch (of 2): The only caller of inode_attach_wb() which doesn't pass NULL already has a folio, so convert the whole call-chain to take folios. Link: https://lkml.kernel.org/r/20230116192507.2146150-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116192507.2146150-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 6 +++--- include/linux/writeback.h | 12 ++++++------ mm/page-writeback.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6fba5a52127b..12f60f1ed2a0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -237,7 +237,7 @@ void wb_wait_for_completion(struct wb_completion *done) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; -void __inode_attach_wb(struct inode *inode, struct page *page) +void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; @@ -245,8 +245,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (page) { - memcg_css = mem_cgroup_css_from_page(page); + if (folio) { + memcg_css = mem_cgroup_css_from_page(&folio->page); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2554b71765e9..3f1491b07474 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,7 +207,7 @@ static inline void wait_on_inode(struct inode *inode) #include #include -void __inode_attach_wb(struct inode *inode, struct page *page); +void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); @@ -222,16 +222,16 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest - * @page: page being dirtied (may be NULL) + * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) - __inode_attach_wb(inode, page); + __inode_attach_wb(inode, folio); } /** @@ -290,7 +290,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) #else /* CONFIG_CGROUP_WRITEBACK */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5e892f20bed7..92b90d2ab513 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2652,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio, struct bdi_writeback *wb; long nr = folio_nr_pages(folio); - inode_attach_wb(inode, &folio->page); + inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); -- cgit v1.2.3 From 75376c6fb93b99e94192cfff48222d11819ee917 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:25:07 +0000 Subject: mm: convert mem_cgroup_css_from_page() to mem_cgroup_css_from_folio() Only one caller doesn't have a folio, so move the page_folio() call to that one caller from mem_cgroup_css_from_folio(). Link: https://lkml.kernel.org/r/20230116192507.2146150-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 6 ++++-- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 12 +++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 12f60f1ed2a0..195dc23e0d83 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -246,7 +246,7 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) struct cgroup_subsys_state *memcg_css; if (folio) { - memcg_css = mem_cgroup_css_from_page(&folio->page); + memcg_css = mem_cgroup_css_from_folio(folio); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ @@ -859,6 +859,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -871,7 +872,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_page(page); + folio = page_folio(page); + css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e605fc885f08..35478695cabf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -890,7 +890,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index faeea84964aa..893427aded01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -350,21 +350,19 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page - * @page: page of interest + * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated - * with @page is returned. The returned css remains associated with @page + * with @folio is returned. The returned css remains associated with @folio * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg; - - memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; -- cgit v1.2.3 From 90c9d13a47d45f2f16530c4d62af2fa4d74dfd16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:24 +0000 Subject: mm: remove page_evictable() Patch series "Remove leftover mlock/munlock page wrappers". We no longer need the various mlock page functions as all callers have folios. This patch (of 4): This function now has no users. Also update the unevictable-lru documentation to discuss folios instead of pages (mostly). [akpm@linux-foundation.org: fix Documentation/mm/unevictable-lru.rst underlining] Link: https://lkml.kernel.org/r/20230117145106.585b277b@canb.auug.org.au Link: https://lkml.kernel.org/r/20230116192827.2146732-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116192827.2146732-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 91 +++++++++++++++++++----------------- mm/internal.h | 11 ----- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 2a90d0721dd9..552c63eef954 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -12,7 +12,7 @@ Introduction This document describes the Linux memory manager's "Unevictable LRU" infrastructure and the use of this to manage several types of "unevictable" -pages. +folios. The document attempts to provide the overall rationale behind this mechanism and the rationale for some of the design decisions that drove the @@ -27,8 +27,8 @@ The Unevictable LRU =================== The Unevictable LRU facility adds an additional LRU list to track unevictable -pages and to hide these pages from vmscan. This mechanism is based on a patch -by Larry Woodman of Red Hat to address several scalability problems with page +folios and to hide these folios from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with folio reclaim in Linux. The problems have been observed at customer sites on large memory x86_64 systems. @@ -52,40 +52,41 @@ The infrastructure may also be able to handle other conditions that make pages unevictable, either by definition or by circumstance, in the future. -The Unevictable LRU Page List ------------------------------ +The Unevictable LRU Folio List +------------------------------ -The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a -companion to the LRU-ordered anonymous and file, active and inactive page lists; -and now it is not even a page list. But following familiar convention, here in -this document and in the source, we often imagine it as a fifth LRU page list. +The Unevictable LRU folio list is a lie. It was never an LRU-ordered +list, but a companion to the LRU-ordered anonymous and file, active and +inactive folio lists; and now it is not even a folio list. But following +familiar convention, here in this document and in the source, we often +imagine it as a fifth LRU folio list. The Unevictable LRU infrastructure consists of an additional, per-node, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. +called the "unevictable" list and an associated folio flag, PG_unevictable, to +indicate that the folio is being managed on the unevictable list. The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when +PG_active flag in that it indicates on which LRU list a folio resides when PG_lru is set. -The Unevictable LRU infrastructure maintains unevictable pages as if they were +The Unevictable LRU infrastructure maintains unevictable folios as if they were on an additional LRU list for a few reasons: - (1) We get to "treat unevictable pages just like we treat other pages in the + (1) We get to "treat unevictable folios just like we treat other folios in the system - which means we get to use the same code to manipulate them, the same code to isolate them (for migrate, etc.), the same code to keep track of the statistics, etc..." [Rik van Riel] - (2) We want to be able to migrate unevictable pages between nodes for memory + (2) We want to be able to migrate unevictable folios between nodes for memory defragmentation, workload management and memory hotplug. The Linux kernel - can only migrate pages that it can successfully isolate from the LRU + can only migrate folios that it can successfully isolate from the LRU lists (or "Movable" pages: outside of consideration here). If we were to - maintain pages elsewhere than on an LRU-like list, where they can be - detected by isolate_lru_page(), we would prevent their migration. + maintain folios elsewhere than on an LRU-like list, where they can be + detected by folio_isolate_lru(), we would prevent their migration. -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. +The unevictable list does not differentiate between file-backed and +anonymous, swap-backed folios. This differentiation is only important +while the folios are, in fact, evictable. The unevictable list benefits from the "arrayification" of the per-node LRU lists and statistics originally proposed and posted by Christoph Lameter. @@ -158,7 +159,7 @@ These are currently used in three places in the kernel: Detecting Unevictable Pages --------------------------- -The function page_evictable() in mm/internal.h determines whether a page is +The function folio_evictable() in mm/internal.h determines whether a folio is evictable or not using the query function outlined above [see section :ref:`Marking address spaces unevictable `] to check the AS_UNEVICTABLE flag. @@ -167,7 +168,7 @@ For address spaces that are so marked after being populated (as SHM regions might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate the page tables for the region as does, for example, mlock(), nor need it make any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during +list. Instead, vmscan will do this if and when it encounters the folios during a reclamation scan. On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan @@ -176,41 +177,43 @@ condition is keeping them unevictable. If an unevictable region is destroyed, the pages are also "rescued" from the unevictable list in the process of freeing them. -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. +folio_evictable() also checks for mlocked folios by calling +folio_test_mlocked(), which is set when a folio is faulted into a +VM_LOCKED VMA, or found in a VMA being VM_LOCKED. -Vmscan's Handling of Unevictable Pages --------------------------------------- +Vmscan's Handling of Unevictable Folios +--------------------------------------- -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they +If unevictable folios are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the folios until they have become evictable again (via munlock() for example) and have been "rescued" from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave an unevictable page on one of the regular +for the sake of expediency, to leave an unevictable folio on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the +folios in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such folios that it encounters: that is, it diverts those folios to the unevictable list for the memory cgroup and node being scanned. -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_active_list() or shrink_page_list() where they will be detected when -vmscan walks the reverse map in folio_referenced() or try_to_unmap(). The page -is culled to the unevictable list when it is released by the shrinker. +There may be situations where a folio is mapped into a VM_LOCKED VMA, +but the folio does not have the mlocked flag set. Such folios will make +it all the way to shrink_active_list() or shrink_page_list() where they +will be detected when vmscan walks the reverse map in folio_referenced() +or try_to_unmap(). The folio is culled to the unevictable list when it +is released by the shrinker. -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the -unevictable state of a page before placing it on the unevictable list. +To "cull" an unevictable folio, vmscan simply puts the folio back on +the LRU list using folio_putback_lru() - the inverse operation to +folio_isolate_lru() - after dropping the folio lock. Because the +condition which makes the folio unevictable may change once the folio +is unlocked, __pagevec_lru_add_fn() will recheck the unevictable state +of a folio before placing it on the unevictable list. MLOCKED Pages ============= -The unevictable page list is also useful for mlock(), in addition to ramfs and +The unevictable folio list is also useful for mlock(), in addition to ramfs and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in NOMMU situations, all mappings are effectively mlocked. diff --git a/mm/internal.h b/mm/internal.h index 2d09a7a0600a..74bc1fe45711 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -159,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio) return ret; } -static inline bool page_evictable(struct page *page) -{ - bool ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. -- cgit v1.2.3 From 7efecffb8e7968c4a6c53177b0053ca4765fe233 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:25 +0000 Subject: mm: remove mlock_vma_page() All callers now have a folio and can call mlock_vma_folio(). Update the documentation to refer to mlock_vma_folio(). Link: https://lkml.kernel.org/r/20230116192827.2146732-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 6 +++--- mm/internal.h | 10 +--------- mm/mlock.c | 4 ++-- mm/rmap.c | 4 ++-- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 552c63eef954..9257235fe904 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -311,7 +311,7 @@ do end up getting faulted into this VM_LOCKED VMA, they will be handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_folio() when the VMA is VM_LOCKED +calls mlock_vma_folio(), which calls mlock_folio() when the VMA is VM_LOCKED (unless it is a PTE mapping of a part of a transparent huge page). Or when it is a newly allocated anonymous page, folio_add_lru_vma() calls mlock_new_folio() instead: similar to mlock_folio(), but can make better @@ -413,7 +413,7 @@ However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, before mlocking any pages already present, if one of those pages were migrated before mlock_pte_range() reached it, it would get counted twice in mlock_count. To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, -so that mlock_vma_page() will skip it. +so that mlock_vma_folio() will skip it. To complete page migration, we place the old and new pages back onto the LRU afterwards. The "unneeded" page - old page on success, new page on failure - @@ -552,6 +552,6 @@ and node unevictable list. rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_folio() to correct them. Such pages are culled to the unevictable list when released by the shrinker. diff --git a/mm/internal.h b/mm/internal.h index 74bc1fe45711..0b74105ea363 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -518,7 +518,7 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* - * mlock_vma_page() and munlock_vma_page(): + * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * @@ -547,12 +547,6 @@ static inline void mlock_vma_folio(struct folio *folio, mlock_folio(folio); } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - mlock_vma_folio(page_folio(page), vma, compound); -} - void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, @@ -656,8 +650,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } diff --git a/mm/mlock.c b/mm/mlock.c index 9e9c8be58277..b680f11879c3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -370,9 +370,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, - * will call mlock_vma_page() and raise page's mlock_count: + * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. - * Communicate this danger to mlock_vma_page() with VM_IO, + * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; diff --git a/mm/rmap.c b/mm/rmap.c index 0d07c500fc86..33e15181ae73 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1260,7 +1260,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __page_check_anon_rmap(page, vma, address); } - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** @@ -1351,7 +1351,7 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** -- cgit v1.2.3 From 672aa27d0bd241759376e62b78abb8aae1792479 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:26 +0000 Subject: mm: remove munlock_vma_page() All callers now have a folio and can call munlock_vma_folio(). Update the documentation to refer to munlock_vma_folio(). Link: https://lkml.kernel.org/r/20230116192827.2146732-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 4 ++-- kernel/events/uprobes.c | 1 - mm/internal.h | 8 -------- mm/rmap.c | 12 ++++++------ 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 9257235fe904..34b8b098c5bc 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -486,7 +486,7 @@ Before the unevictable/mlock changes, mlocking did not mark the pages in any way, so unmapping them required no processing. For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). munlock_page() uses the mlock pagevec to batch up work to be done under @@ -510,7 +510,7 @@ which had been Copied-On-Write from the file pages now being truncated. Mlocked pages can be munlocked and deleted in this way: like with munmap(), for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). However, if there is a racing munlock(), since mlock_vma_pages_range() starts diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 29f36d2ae129..1a3904e0179c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -22,7 +22,6 @@ #include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include #include #include diff --git a/mm/internal.h b/mm/internal.h index 0b74105ea363..ce462bf145b4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -548,7 +548,6 @@ static inline void mlock_vma_folio(struct folio *folio, } void munlock_folio(struct folio *folio); - static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { @@ -557,11 +556,6 @@ static inline void munlock_vma_folio(struct folio *folio, munlock_folio(folio); } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - munlock_vma_folio(page_folio(page), vma, compound); -} void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); @@ -650,8 +644,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } diff --git a/mm/rmap.c b/mm/rmap.c index 33e15181ae73..0b5abdda1e6b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,14 +1431,14 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, } /* - * It would be tidy to reset PageAnon mapping when fully unmapped, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_pages_prepare, - * and remember that it's only reliable while mapped. + * It would be tidy to reset folio_test_anon mapping when fully + * unmapped, but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping before us: + * so leave the reset to free_pages_prepare, and remember that + * it's only reliable while mapped. */ - munlock_vma_page(page, vma, compound); + munlock_vma_folio(folio, vma, compound); } /* -- cgit v1.2.3 From e0650a41f7d024b72669a2a2db846ef70281abd8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:27 +0000 Subject: mm: clean up mlock_page / munlock_page references in comments Change documentation and comments that refer to now-renamed functions. Link: https://lkml.kernel.org/r/20230116192827.2146732-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 30 ++++++++++++++++-------------- mm/memory-failure.c | 2 +- mm/swap.c | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 34b8b098c5bc..53e59433497a 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -298,7 +298,7 @@ treated as a no-op and mlock_fixup() simply returns. If the VMA passes some filtering as described in "Filtering Special VMAs" below, mlock_fixup() will attempt to merge the VMA with its neighbors or split off a subset of the VMA if the range does not cover the entire VMA. Any pages -already present in the VMA are then marked as mlocked by mlock_page() via +already present in the VMA are then marked as mlocked by mlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). Before returning from the system call, do_mlock() or mlockall() will call @@ -373,20 +373,21 @@ Because of the VMA filtering discussed above, VM_LOCKED will not be set in any "special" VMAs. So, those VMAs will be ignored for munlock. If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. All pages in the VMA are then munlocked by munlock_page() via +specified range. All pages in the VMA are then munlocked by munlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same function used when mlocking a VMA range, with new flags for the VMA indicating that it is munlock() being performed. -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PG_mlocked and clears -PG_unevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. @@ -489,15 +490,16 @@ For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PG_mlocked and clears -PG_unevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ba0bbfc074ee..38f84bff8bdf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2167,7 +2167,7 @@ try_again: } /* - * __munlock_pagevec may clear a writeback page's LRU flag without + * __munlock_folio() may clear a writeback page's LRU flag without * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode. */ diff --git a/mm/swap.c b/mm/swap.c index 5e4f92700c16..2a51faa34e64 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears the mlocked flag + * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -216,7 +216,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another + * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. -- cgit v1.2.3 From 5b4bd90f9ac76136c7148684b12276d4ae2d64a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:29:59 +0000 Subject: rmap: add folio parameter to __page_set_anon_rmap() Avoid the compound_head() call in PageAnon() by passing in the folio that all callers have. Also save me from wondering whether page->mapping can ever be overwritten on a tail page (I don't think it can, but I'm not 100% sure). Link: https://lkml.kernel.org/r/20230116192959.2147032-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/rmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0b5abdda1e6b..43760d622040 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1131,19 +1131,20 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page or Hugepage to add to rmap + * @folio: Folio which contains page. + * @page: Page to add to rmap. * @vma: VM area to add page to. * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ -static void __page_set_anon_rmap(struct page *page, +static void __page_set_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); - if (PageAnon(page)) + if (folio_test_anon(folio)) goto out; /* @@ -1155,14 +1156,14 @@ static void __page_set_anon_rmap(struct page *page, anon_vma = anon_vma->root; /* - * page_idle does a lockless/optimistic rmap scan on page->mapping. + * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - page->index = linear_page_index(vma, address); + WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); + folio->index = linear_page_index(vma, address); out: if (exclusive) SetPageAnonExclusive(page); @@ -1254,7 +1255,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (likely(!folio_test_ksm(folio))) { /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(page, vma, address); @@ -1297,7 +1298,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(&folio->page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } /** @@ -2528,7 +2529,7 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); } @@ -2541,6 +2542,6 @@ void hugepage_add_new_anon_rmap(struct page *page, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(page, vma, address, 1); + __page_set_anon_rmap(folio, page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From 8808ecab3afc18958a9216911cd7967017e7057c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:39 +0000 Subject: filemap: convert filemap_map_pmd() to take a folio Patch series "Some more filemap folio conversions". Three more places which could easily be converted to folios. The third one fixes a minor bug in readahead_expand(), but it's only a performance bug and there are few users of readahead_expand(), so I don't think it's worth backporting. This patch (of 3): Save a few calls to compound_head(). We specify exactly which page from the folio to use by passing in start_pgoff, which means this will work for a folio which is larger than PMD size. The rest of the VM isn't prepared for that yet, but now this function is. Link: https://lkml.kernel.org/r/20230116193941.2148487-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116193941.2148487-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/filemap.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 31bf18ec6d01..b6b7efc9abc0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3259,22 +3259,24 @@ out_retry: } EXPORT_SYMBOL(filemap_fault); -static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) +static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, + pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } - if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { + struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ - unlock_page(page); + folio_unlock(folio); return true; } } @@ -3284,8 +3286,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } @@ -3368,7 +3370,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio) goto out; - if (filemap_map_pmd(vmf, &folio->page)) { + if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } -- cgit v1.2.3 From eff3b364b496e01ec789f3e15a51f9a3589e2a23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:40 +0000 Subject: filemap: convert filemap_range_has_page() to use a folio The folio isn't returned from this function, so this is an entirely internal change. Link: https://lkml.kernel.org/r/20230116193941.2148487-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/filemap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index b6b7efc9abc0..c915ded191f0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -470,7 +470,7 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - struct page *page; + struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -479,11 +479,11 @@ bool filemap_range_has_page(struct address_space *mapping, rcu_read_lock(); for (;;) { - page = xas_find(&xas, max); - if (xas_retry(&xas, page)) + folio = xas_find(&xas, max); + if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to @@ -494,7 +494,7 @@ bool filemap_range_has_page(struct address_space *mapping, } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); -- cgit v1.2.3 From 11a980420719712f419dbb325940907f5d1afbdd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:41 +0000 Subject: readahead: convert readahead_expand() to use a folio Replace the uses of page with a folio. Also add a missing test for workingset in the leading edge expansion. Link: https://lkml.kernel.org/r/20230116193941.2148487-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/readahead.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index b10f0cf81d80..47afbca1d122 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -801,21 +801,25 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; - ractl->_index = page->index; + ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); @@ -824,19 +828,20 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } -- cgit v1.2.3 From 98001fd63d59d2f99c90db823d322de91ff7d771 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 Jan 2023 16:43:32 +0000 Subject: mm/secretmem: remove redundant initiialization of pointer file The pointer file is being initialized with a value that is never read, it is being re-assigned later on. Clean up code by removing the redundant initialization. Link: https://lkml.kernel.org/r/20230116164332.79500-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 04c3ac9448a1..be3fff86ba00 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -190,7 +190,7 @@ static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { - struct file *file = ERR_PTR(-ENOMEM); + struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); -- cgit v1.2.3 From 64517d6e1291b5e942b00c53674ecf33f918313f Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 16 Jan 2023 14:23:47 +0800 Subject: mm/damon/core: skip apply schemes if empty Sometimes there is no scheme in damon's context, for example just use damo record to monitor workload's data access pattern. If current damon context doesn't have any scheme in the list, kdamond has no need to iterate over list of all targets and regions but do nothing. So, skip apply schemes when ctx->schemes is empty. Link: https://lkml.kernel.org/r/20230116062347.1148553-1-huaisheng.ye@intel.com Signed-off-by: Huaisheng Ye Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 1bf0654ae189..2db8c53491ca 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1269,7 +1269,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) -- cgit v1.2.3 From 7ec7096b8577d3b899c1dae456a414f2d08c7ddb Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 17 Jan 2023 20:46:17 +0000 Subject: mm/page_ext: init page_ext early if there are no deferred struct pages page_ext must be initialized after all struct pages are initialized. Therefore, page_ext is initialized after page_alloc_init_late(), and can optionally be initialized earlier via early_page_ext kernel parameter which as a side effect also disables deferred struct pages. Allow to automatically init page_ext early when there are no deferred struct pages in order to be able to use page_ext during kernel boot and track for example page allocations early. [pasha.tatashin@soleen.com: fix build with CONFIG_PAGE_EXTENSION=n] Link: https://lkml.kernel.org/r/20230118155251.2522985-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20230117204617.1553748-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Li Zhe Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 2 ++ init/main.c | 6 +++--- mm/page_alloc.c | 6 +++++- mm/page_ext.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648aeb..bc2e39090a1f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -29,6 +29,8 @@ struct page_ext_operations { bool need_shared_flags; }; +extern bool deferred_struct_pages; + #ifdef CONFIG_PAGE_EXTENSION /* diff --git a/init/main.c b/init/main.c index e1c3911d7c70..64cd2ff051c4 100644 --- a/init/main.c +++ b/init/main.c @@ -855,8 +855,8 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - /* Should be run after vmap initialization */ - if (early_page_ext_enabled()) + /* If no deferred init page_ext now, as vmap is fully initialized */ + if (!deferred_struct_pages) page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); @@ -1628,7 +1628,7 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - if (!early_page_ext_enabled()) + if (deferred_struct_pages) page_ext_init(); do_basic_setup(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0cfad30fb44c..ecb9e9acfe7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +bool deferred_struct_pages __meminitdata; + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once @@ -6803,8 +6805,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, zone_end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) { + deferred_struct_pages = true; break; + } } page = pfn_to_page(pfn); diff --git a/mm/page_ext.c b/mm/page_ext.c index e2c22ffdbb81..dc1626be458b 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -92,7 +92,7 @@ unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); -bool early_page_ext; +bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; -- cgit v1.2.3 From 076cf7ea67010d11a97912423f4cdbeff1bd1f5f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 5 Jan 2023 13:55:06 +0530 Subject: mm/page_alloc: use deferred_pages_enabled() wherever applicable Instead of directly accessing static deferred_pages, replace such instances with the helper deferred_pages_enabled(). No functional change is intended. Link: https://lkml.kernel.org/r/20230105082506.241529-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecb9e9acfe7f..717f12e83b85 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4288,7 +4288,7 @@ retry: * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4337,7 +4337,7 @@ try_this_zone: } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -- cgit v1.2.3 From 6260ae3583456808dceb4d78077e6388c49ca6d7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:07 +0900 Subject: zsmalloc: rework zspage chain size selection Patch series "zsmalloc: make zspage chain size configurable". Computers are bad at division. We currently decide the best zspage chain size (max number of physical pages per-zspage) by looking at a `used percentage` value. This is not enough as we lose precision during usage percentage calculations For example, let's look at size class 208: pages per zspage wasted bytes used% 1 144 96 2 80 99 3 16 99 4 160 99 Current algorithm will select 2 page per zspage configuration, as it's the first one to reach 99%. However, 3 pages per zspage waste less memory. Change algorithm and select zspage configuration that has lowest wasted value. Link: https://lkml.kernel.org/r/20230118005210.2814763-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20230118005210.2814763-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 56 +++++++++++++++++++------------------------------------- 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9d27d9b00bce..00ab4cca49e4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -822,42 +822,6 @@ out: return newfg; } -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp % class_size - * usage = Zp - wastage - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. - */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - static struct zspage *get_zspage(struct page *page) { struct zspage *zspage = (struct zspage *)page_private(page); @@ -2401,6 +2365,24 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->name); } +static int calculate_zspage_chain_size(int class_size) +{ + int i, min_waste = INT_MAX; + int chain_size = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int waste; + + waste = (i * PAGE_SIZE) % class_size; + if (waste < min_waste) { + min_waste = waste; + chain_size = i; + } + } + + return chain_size; +} + /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created @@ -2445,7 +2427,7 @@ struct zs_pool *zs_create_pool(const char *name) size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; - pages_per_zspage = get_pages_per_zspage(size); + pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* -- cgit v1.2.3 From e1d1f3546913ae0af9da31df8183a6f3da0cd590 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:08 +0900 Subject: zsmalloc: skip chain size calculation for pow_of_2 classes If a class size is power of 2 then it wastes no memory and the best configuration is 1 physical page per-zspage. Link: https://lkml.kernel.org/r/20230118005210.2814763-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 00ab4cca49e4..7b904f9bed70 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2370,6 +2370,9 @@ static int calculate_zspage_chain_size(int class_size) int i, min_waste = INT_MAX; int chain_size = 1; + if (is_power_of_2(class_size)) + return chain_size; + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { int waste; -- cgit v1.2.3 From 4ff93b292c0b91b9a7c9fb54643f122bbe59d8d0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:09 +0900 Subject: zsmalloc: make zspage chain size configurable Remove hard coded limit on the maximum number of physical pages per-zspage. This will allow tuning of zsmalloc pool as zspage chain size changes `pages per-zspage` and `objects per-zspage` characteristics of size classes which also affects size classes clustering (the way size classes are merged). Link: https://lkml.kernel.org/r/20230118005210.2814763-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- Documentation/mm/zsmalloc.rst | 168 ++++++++++++++++++++++++++++++++++++++++++ mm/Kconfig | 19 +++++ mm/zsmalloc.c | 12 +-- 3 files changed, 191 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index 6e79893d6132..40323c9b39d8 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -80,3 +80,171 @@ Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f * ZS_EMPTY when n == 0 * ZS_FULL when n == N + + +Internals +========= + +zsmalloc has 255 size classes, each of which can hold a number of zspages. +Each zspage can contain up to ZSMALLOC_CHAIN_SIZE physical (0-order) pages. +The optimal zspage chain size for each size class is calculated during the +creation of the zsmalloc pool (see calculate_zspage_chain_size()). + +As an optimization, zsmalloc merges size classes that have similar +characteristics in terms of the number of pages per zspage and the number +of objects that each zspage can store. + +For instance, consider the following size classes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 94 1536 0 0 0 0 0 3 0 + 100 1632 0 0 0 0 0 2 0 + ... + + +Size classes #95-99 are merged with size class #100. This means that when we +need to store an object of size, say, 1568 bytes, we end up using size class +#100 instead of size class #96. Size class #100 is meant for objects of size +1632 bytes, so each object of size 1568 bytes wastes 1632-1568=64 bytes. + +Size class #100 consists of zspages with 2 physical pages each, which can +hold a total of 5 objects. If we need to store 13 objects of size 1568, we +end up allocating three zspages, or 6 physical pages. + +However, if we take a closer look at size class #96 (which is meant for +objects of size 1568 bytes) and trace `calculate_zspage_chain_size()`, we +find that the most optimal zspage configuration for this class is a chain +of 5 physical pages::: + + pages per zspage wasted bytes used% + 1 960 76 + 2 352 95 + 3 1312 89 + 4 704 95 + 5 96 99 + +This means that a class #96 configuration with 5 physical pages can store 13 +objects of size 1568 in a single zspage, using a total of 5 physical pages. +This is more efficient than the class #100 configuration, which would use 6 +physical pages to store the same number of objects. + +As the zspage chain size for class #96 increases, its key characteristics +such as pages per-zspage and objects per-zspage also change. This leads to +dewer class mergers, resulting in a more compact grouping of classes, which +reduces memory wastage. + +Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages +per zspage. Any object larger than 3264 bytes is considered huge and belongs +to size class #254, which stores each object in its own physical page (objects +in huge classes do not share pages). + +Increasing the size of the chain of zspages also results in a higher watermark +for the huge size class and fewer huge classes overall. This allows for more +efficient storage of large objects. + +For zspage chain size of 8, huge class watermark becomes 3632 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 211 3408 0 0 0 0 0 5 0 + 217 3504 0 0 0 0 0 6 0 + 222 3584 0 0 0 0 0 7 0 + 225 3632 0 0 0 0 0 8 0 + 254 4096 0 0 0 0 0 1 0 + ... + +For zspage chain size of 16, huge class watermark becomes 3840 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 206 3328 0 0 0 0 0 13 0 + 207 3344 0 0 0 0 0 9 0 + 208 3360 0 0 0 0 0 14 0 + 211 3408 0 0 0 0 0 5 0 + 212 3424 0 0 0 0 0 16 0 + 214 3456 0 0 0 0 0 11 0 + 217 3504 0 0 0 0 0 6 0 + 219 3536 0 0 0 0 0 13 0 + 222 3584 0 0 0 0 0 7 0 + 223 3600 0 0 0 0 0 15 0 + 225 3632 0 0 0 0 0 8 0 + 228 3680 0 0 0 0 0 9 0 + 230 3712 0 0 0 0 0 10 0 + 232 3744 0 0 0 0 0 11 0 + 234 3776 0 0 0 0 0 12 0 + 235 3792 0 0 0 0 0 13 0 + 236 3808 0 0 0 0 0 14 0 + 238 3840 0 0 0 0 0 15 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Overall the combined zspage chain size effect on zsmalloc pool configuration::: + + pages per zspage number of size classes (clusters) huge size class watermark + 4 69 3264 + 5 86 3408 + 6 93 3504 + 7 112 3584 + 8 123 3632 + 9 140 3680 + 10 143 3712 + 11 159 3744 + 12 164 3776 + 13 180 3792 + 14 183 3808 + 15 188 3840 + 16 191 3840 + + +A synthetic test +---------------- + +zram as a build artifacts storage (Linux kernel compilation). + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=4` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 13 51 413836 412973 159955 3 + + zram mm_stat::: + + 1691783168 628083717 655175680 0 655175680 60 0 34048 34049 + + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=8` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 18 87 414852 412978 156666 0 + + zram mm_stat::: + + 1691803648 627793930 641703936 0 641703936 60 0 33591 33591 + +Using larger zspage chains may result in using fewer physical pages, as seen +in the example where the number of physical pages used decreased from 159955 +to 156666, at the same time maximum zsmalloc pool memory usage went down from +655175680 to 641703936 bytes. + +However, this advantage may be offset by the potential for increased system +memory pressure (as some zspages have larger chain sizes) in cases where there +is heavy internal fragmentation and zspool compaction is unable to relocate +objects and release zspages. In these cases, it is recommended to decrease +the limit on the size of the zspage chains (as specified by the +CONFIG_ZSMALLOC_CHAIN_SIZE option). diff --git a/mm/Kconfig b/mm/Kconfig index 39df30dcabe3..83b1d278b31c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,25 @@ config ZSMALLOC_STAT information to userspace via debugfs. If unsure, say N. +config ZSMALLOC_CHAIN_SIZE + int "Maximum number of physical pages per-zspage" + default 4 + range 4 16 + depends on ZSMALLOC + help + This option sets the upper limit on the number of physical pages + that a zmalloc page (zspage) can consist of. The optimal zspage + chain size is calculated for each size class during the + initialization of the pool. + + Changing this option can alter the characteristics of size classes, + such as the number of pages per zspage and the number of objects + per zspage. This can also result in different configurations of + the pool, as zsmalloc merges size classes with similar + characteristics. + + For more information, see zsmalloc documentation. + menu "SLAB allocator options" choice diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7b904f9bed70..3aed46ab7e6c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -73,13 +73,6 @@ */ #define ZS_ALIGN 8 -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* @@ -136,10 +129,13 @@ #define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 -#define ISOLATED_BITS 3 +#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) + +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) + /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -- cgit v1.2.3 From b46402fa894f88ddb42a2e841618a8f42f57d16d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:10 +0900 Subject: zsmalloc: set default zspage chain size to 8 This changes key characteristics (pages per-zspage and objects per-zspage) of a number of size classes which in results in different pool configuration. With zspage chain size of 8 we have more size clases clusters (123) and higher huge size class watermark (3632 bytes). Please read zsmalloc documentation for more details. Link: https://lkml.kernel.org/r/20230118005210.2814763-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 83b1d278b31c..4751031f3f05 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -193,7 +193,7 @@ config ZSMALLOC_STAT config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" - default 4 + default 8 range 4 16 depends on ZSMALLOC help -- cgit v1.2.3 From 04bac040bc71b4b37550eed5854f34ca161756f9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 18 Jan 2023 09:40:39 -0800 Subject: mm/hugetlb: convert get_hwpoison_huge_page() to folios Straightforward conversion of get_hwpoison_huge_page() to get_hwpoison_hugetlb_folio(). Reduces two references to a head page in memory-failure.c [arnd@arndb.de: fix get_hwpoison_hugetlb_folio() stub] Link: https://lkml.kernel.org/r/20230119111920.635260-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230118174039.14247-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Arnd Bergmann Acked-by: Naoya Horiguchi Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 10 +++++----- mm/memory-failure.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cf60fe741c1d..a51e6daacac6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -172,7 +172,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); @@ -418,7 +418,7 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 291ad4cb02f9..0f9df0143772 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7268,18 +7268,18 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); - if (PageHeadHuge(page)) { + if (folio_test_hugetlb(folio)) { *hugetlb = true; - if (HPageFreed(page)) + if (folio_test_hugetlb_freed(folio)) ret = 0; - else if (HPageMigratable(page) || unpoison) - ret = get_page_unless_zero(page); + else if (folio_test_hugetlb_migratable(folio) || unpoison) + ret = folio_try_get(folio); else ret = -EBUSY; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 38f84bff8bdf..0a382191737f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1257,28 +1257,28 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) static int __get_hwpoison_page(struct page *page, unsigned long flags) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, false); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); if (hugetlb) return ret; /* - * This check prevents from calling get_page_unless_zero() for any - * unsupported type of page in order to reduce the risk of unexpected - * races caused by taking a page refcount. + * This check prevents from calling folio_try_get() for any + * unsupported type of folio in order to reduce the risk of unexpected + * races caused by taking a folio refcount. */ - if (!HWPoisonHandlable(head, flags)) + if (!HWPoisonHandlable(&folio->page, flags)) return -EBUSY; - if (get_page_unless_zero(head)) { - if (head == compound_head(page)) + if (folio_try_get(folio)) { + if (folio == page_folio(page)) return 1; pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); - put_page(head); + folio_put(folio); } return 0; @@ -1347,11 +1347,11 @@ out: static int __get_unpoison_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, true); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); if (hugetlb) return ret; -- cgit v1.2.3 From 5649d113ffce9f532a9ecc5ab96a93e02efbf283 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 18 Jan 2023 20:13:03 +0800 Subject: swap_state: update shadow_nodes for anonymous page Shadow_nodes is for shadow nodes reclaiming of workingset handling, it is updated when page cache add or delete since long time ago workingset only supported page cache. But when workingset supports anonymous page detection, we missied updating shadow nodes for it. This caused that shadow nodes of anonymous page will never be reclaimd by scan_shadow_nodes() even they use much memory and system memory is tense. So update shadow_nodes of anonymous page when swap cache is add or delete by calling xas_set_update(..workingset_update_node). Link: https://lkml.kernel.org/r/202301182013032211005@zte.com.cn Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") Signed-off-by: Yang Yang Reviewed-by: Ran Xiaokai Cc: Bagas Sanjaya Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/xarray.h | 3 ++- mm/swap_state.c | 6 ++++++ mm/workingset.c | 21 +++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 44dd6d6e01bc..741703b45f61 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1643,7 +1643,8 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index, * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. - * This is advanced functionality and is only needed by the page cache. + * This is advanced functionality and is only needed by the page + * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { diff --git a/mm/swap_state.c b/mm/swap_state.c index cb9aaa00951d..7a003d8abb37 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -94,6 +94,8 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, unsigned long i, nr = folio_nr_pages(folio); void *old; + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); @@ -145,6 +147,8 @@ void __delete_from_swap_cache(struct folio *folio, pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); @@ -252,6 +256,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); + xas_set_update(&xas, workingset_update_node); + xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) diff --git a/mm/workingset.c b/mm/workingset.c index f194d13beabb..00c6f4d9d9be 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -657,11 +657,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - if (!spin_trylock(&mapping->host->i_lock)) { - xa_unlock(&mapping->i_pages); - spin_unlock_irq(lru_lock); - ret = LRU_RETRY; - goto out; + /* For page cache we need to hold i_lock */ + if (mapping->host != NULL) { + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } } list_lru_isolate(lru, item); @@ -683,9 +686,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, out_invalid: xa_unlock_irq(&mapping->i_pages); - if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); - spin_unlock(&mapping->host->i_lock); + if (mapping->host != NULL) { + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + } ret = LRU_REMOVED_RETRY; out: cond_resched(); -- cgit v1.2.3 From 148aa87e4f631e98d926d006604116fd2b2f3a93 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Wed, 18 Jan 2023 17:05:23 +0900 Subject: mm/cma: fix potential memory loss on cma_declare_contiguous_nid Suppose memblock_alloc_range_nid() with highmem_start succeeds when cma_declare_contiguous_nid is called with !fixed on a 32-bit system with PHYS_ADDR_T_64BIT enabled with memblock.bottom_up == false. But the next trial to memblock_alloc_range_nid() to allocate in [SIZE_4G, limits) nullifies former successfully allocated addr and it retries memblock_alloc_ragne_nid(). In this situation, the first successfully allocated address area is lost. Change the order of allocation (SIZE_4G, high_memory and base) and check whether the allocated succeeded to prevent potential memory loss. Link: https://lkml.kernel.org/r/20230118080523.44522-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Cc: Laurent Pinchart Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index a75b17b03b66..a7263aa02c92 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -321,18 +321,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } else { phys_addr_t addr = 0; - /* - * All pages in the reserved area must come from the same zone. - * If the requested region crosses the low/high memory boundary, - * try allocating from high memory first and fall back to low - * memory in case of failure. - */ - if (base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, true); - limit = highmem_start; - } - /* * If there is enough memory, try a bottom-up allocation first. * It will place the new cma area close to the start of the node @@ -350,6 +338,18 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } #endif + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (!addr && base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range_nid(size, alignment, + highmem_start, limit, nid, true); + limit = highmem_start; + } + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); -- cgit v1.2.3 From d0634a622be35df2dfa80dc14ee1482ae1889cb2 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Tue, 17 Jan 2023 21:54:03 -0500 Subject: Documentation: mm: use `s/higmem/highmem/` fix typo for highmem We should use highmem replace higmem. Link: https://lkml.kernel.org/r/20230118025403.1531-1-wangdeming@inspur.com Signed-off-by: Deming Wang Reviewed-by: Ira Weiny Cc: "Fabio M. De Francesco" Cc: Jonathan Corbet Cc: Mike Rapoport (IBM) Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- Documentation/mm/highmem.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index e691a06fb337..4503868b0865 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -83,7 +83,7 @@ list shows them in order of preference of use. for pages which are known to not come from ZONE_HIGHMEM. However, it is always safe to use kmap_local_page() / kunmap_local(). - While it is significantly faster than kmap(), for the higmem case it + While it is significantly faster than kmap(), for the highmem case it comes with restrictions about the pointers validity. Contrary to kmap() mappings, the local mappings are only valid in the context of the caller and cannot be handed to other contexts. This implies that users must -- cgit v1.2.3 From e6d2c436ff693869e83a65e61643b922e193e162 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Mon, 16 Jan 2023 19:49:21 -0300 Subject: tools/mm: allow users to provide additional cflags/ldflags Right now there is no way to provide additional cflags/ldflags when building tools/vm binaries. And using eg. make CFLAGS= will override the CFLAGS being set in the Makefile, making the build fail since it requires the include of the ../lib dir (for libapi). This change then allows you to specify: CFLAGS= LDFLAGS= make V=1 -C tools/vm And the options will be correctly appended as can be seen from the make output. Link: https://lkml.kernel.org/r/20230116224921.4106324-1-herton@redhat.com Signed-off-by: Herton R. Krzesinski Cc: Don Zickus Cc: Justin Forbes Cc: Vlastimil Babka Cc: Scott Weaver Signed-off-by: Andrew Morton --- tools/mm/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 9860622cbb15..6c1da51f4177 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -8,8 +8,8 @@ TARGETS=page-types slabinfo page_owner_sort LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a -CFLAGS = -Wall -Wextra -I../lib/ -LDFLAGS = $(LIBS) +CFLAGS += -Wall -Wextra -I../lib/ +LDFLAGS += $(LIBS) all: $(TARGETS) -- cgit v1.2.3 From b507808ebce23561d4ff8c2aa1fb949fe402bc61 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 19 Jan 2023 16:03:43 +0000 Subject: mm: implement memory-deny-write-execute as a prctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)", v2. The background to this is that systemd has a configuration option called MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter. Its aim is to prevent a user task from inadvertently creating an executable mapping that is (or was) writeable. Since such BPF filter is stateless, it cannot detect mappings that were previously writeable but subsequently changed to read-only. Therefore the filter simply rejects any mprotect(PROT_EXEC). The side-effect is that on arm64 with BTI support (Branch Target Identification), the dynamic loader cannot change an ELF section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect(). For libraries, it can resort to unmapping and re-mapping but for the main executable it does not have a file descriptor. The original bug report in the Red Hat bugzilla - [3] - and subsequent glibc workaround for libraries - [4]. This series adds in-kernel support for this feature as a prctl PR_SET_MDWE, that is inherited on fork(). The prctl denies PROT_WRITE | PROT_EXEC mappings. Like the systemd BPF filter it also denies adding PROT_EXEC to mappings. However unlike the BPF filter it only denies it if the mapping didn't previous have PROT_EXEC. This allows to PROT_EXEC -> PROT_EXEC | PROT_BTI with mprotect(), which is a problem with the BPF filter. This patch (of 2): The aim of such policy is to prevent a user task from creating an executable mapping that is also writeable. An example of mmap() returning -EACCESS if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCESS below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables branch tracking identification on arm64. Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com Signed-off-by: Joey Gouly Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Alexander Viro Cc: Jeremy Linton Cc: Kees Cook Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Shuah Khan Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mman.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched/coredump.h | 6 +++++- include/uapi/linux/prctl.h | 6 ++++++ kernel/sys.c | 33 +++++++++++++++++++++++++++++++++ mm/mmap.c | 10 ++++++++++ mm/mprotect.c | 5 +++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a3..cee1e4b566d8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7ae14c..0e17ae7fbfd3 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a5e06dcbba13..1312a137f7fb 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 66 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf0e886..b3cab94545ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? + PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/mm/mmap.c b/mm/mmap.c index 335ba3df9898..ffc0815cd7fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2669,6 +2669,16 @@ cannot_expand: vma_set_anonymous(vma); } + if (map_deny_write_exec(vma, vma->vm_flags)) { + error = -EACCES; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + /* Allow architectures to sanity-check the vm_flags */ if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; diff --git a/mm/mprotect.c b/mm/mprotect.c index 6ecdf0671b81..6a22f3ad9b84 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } + if (map_deny_write_exec(vma, newflags)) { + error = -EACCES; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; -- cgit v1.2.3 From 4cf1fe34fd18b752ae2403927277715d4444f331 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Jan 2023 16:03:44 +0000 Subject: kselftest: vm: add tests for memory-deny-write-execute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add some tests to cover the new PR_SET_MDWE prctl. Link: https://lkml.kernel.org/r/20230119160344.54358-3-joey.gouly@arm.com Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Signed-off-by: Kees Cook Cc: Shuah Khan Cc: Alexander Viro Cc: Catalin Marinas Cc: Jeremy Linton Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mdwe_test.c | 197 +++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 tools/testing/selftests/mm/mdwe_test.c diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 0a44d77f8437..d90cdc06aa59 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -60,6 +60,7 @@ TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_PROGS += ksm_functional_tests +TEST_GEN_PROGS += mdwe_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c new file mode 100644 index 000000000000..f466a099f1bf --- /dev/null +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifdef __aarch64__ +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#ifndef __aarch64__ +# define PROT_BTI 0 +#endif + +TEST(prctl_flags) +{ + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + + EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); +} + +FIXTURE(mdwe) +{ + void *p; + int flags; + size_t size; + pid_t pid; +}; + +FIXTURE_VARIANT(mdwe) +{ + bool enabled; + bool forked; +}; + +FIXTURE_VARIANT_ADD(mdwe, stock) +{ + .enabled = false, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, enabled) +{ + .enabled = true, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, forked) +{ + .enabled = true, + .forked = true, +}; + +FIXTURE_SETUP(mdwe) +{ + int ret, status; + + self->p = NULL; + self->flags = MAP_SHARED | MAP_ANONYMOUS; + self->size = getpagesize(); + + if (!variant->enabled) + return; + + ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L); + ASSERT_EQ(ret, 0) { + TH_LOG("PR_SET_MDWE failed or unsupported"); + } + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, 1); + + if (variant->forked) { + self->pid = fork(); + ASSERT_GE(self->pid, 0) { + TH_LOG("fork failed\n"); + } + + if (self->pid > 0) { + ret = waitpid(self->pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + exit(WEXITSTATUS(status)); + } + } +} + +FIXTURE_TEARDOWN(mdwe) +{ + if (self->p && self->p != MAP_FAILED) + munmap(self->p, self->size); +} + +TEST_F(mdwe, mmap_READ_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + EXPECT_NE(self->p, MAP_FAILED); +} + +TEST_F(mdwe, mmap_WRITE_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); + if (variant->enabled) { + EXPECT_EQ(self->p, MAP_FAILED); + } else { + EXPECT_NE(self->p, MAP_FAILED); + } +} + +TEST_F(mdwe, mprotect_stay_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + EXPECT_EQ(ret, 0); +} + +TEST_F(mdwe, mprotect_add_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mprotect_WRITE_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mmap_FIXED) +{ + void *p, *p2; + + p2 = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC, + self->flags | MAP_FIXED, 0, 0); + if (variant->enabled) { + EXPECT_EQ(p, MAP_FAILED); + } else { + EXPECT_EQ(p, self->p); + } +} + +TEST_F(mdwe, arm64_BTI) +{ + int ret; + +#ifdef __aarch64__ + if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI)) +#endif + SKIP(return, "HWCAP2_BTI not supported"); + + self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI); + EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 6061e740822530a4ef443548b19c4e0bc6342c7a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 18 Jan 2023 23:01:10 -0500 Subject: mm/kmemleak: simplify kmemleak_cond_resched() usage Patch series "mm/kmemleak: Simplify kmemleak_cond_resched() & fix UAF", v2. It was found that a KASAN use-after-free error was reported in the kmemleak_scan() function. After further examination, it is believe that even though a reference is taken from the current object, it does not prevent the object pointed to by the next pointer from going away after a cond_resched(). To fix that, additional flags are added to make sure that the current object won't be removed from the object_list during the duration of the cond_resched() to ensure the validity of the next pointer. While making the change, I also simplify the current usage of kmemleak_cond_resched() to make it easier to understand. This patch (of 2): The presence of a pinned argument and the 64k loop count make kmemleak_cond_resched() a bit more complex to read. The pinned argument is used only by first kmemleak_scan() loop. Simplify the usage of kmemleak_cond_resched() by removing the pinned argument and always do a get_object()/put_object() sequence. In addition, the 64k loop is removed by using need_resched() to decide if kmemleak_cond_resched() should be called. Link: https://lkml.kernel.org/r/20230119040111.350923-1-longman@redhat.com Link: https://lkml.kernel.org/r/20230119040111.350923-2-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 55dc8b8b0616..69327b71fcf9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1472,22 +1472,17 @@ static void scan_gray_list(void) /* * Conditionally call resched() in an object iteration loop while making sure * that the given object won't go away without RCU read lock by performing a - * get_object() if !pinned. - * - * Return: false if can't do a cond_resched() due to get_object() failure - * true otherwise + * get_object() if necessaary. */ -static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +static void kmemleak_cond_resched(struct kmemleak_object *object) { - if (!pinned && !get_object(object)) - return false; + if (!get_object(object)) + return; /* Try next object */ rcu_read_unlock(); cond_resched(); rcu_read_lock(); - if (!pinned) - put_object(object); - return true; + put_object(object); } /* @@ -1501,15 +1496,12 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop_cnt = 0; jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - bool obj_pinned = false; - raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1535,19 +1527,13 @@ static void kmemleak_scan(void) /* reset the reference count (whiten the object) */ object->count = 0; - if (color_gray(object) && get_object(object)) { + if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); - obj_pinned = true; - } raw_spin_unlock_irq(&object->lock); - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, obj_pinned)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); } rcu_read_unlock(); @@ -1614,14 +1600,9 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock @@ -1656,14 +1637,9 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock -- cgit v1.2.3 From 782e4179535971c3574c367bfaaefea8970b3e0b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 18 Jan 2023 23:01:11 -0500 Subject: mm/kmemleak: fix UAF bug in kmemleak_scan() Commit 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") fixes soft lockup problem in kmemleak_scan() by periodically doing a cond_resched(). It does take a reference of the current object before doing it. Unfortunately, if the object has been deleted from the object_list, the next object pointed to by its next pointer may no longer be valid after coming back from cond_resched(). This can result in use-after-free and other nasty problem. Fix this problem by adding a del_state flag into kmemleak_object structure to synchronize the object deletion process between kmemleak_cond_resched() and __remove_object() to make sure that the object remained in the object_list in the duration of the cond_resched() call. Link: https://lkml.kernel.org/r/20230119040111.350923-3-longman@redhat.com Fixes: 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 69327b71fcf9..d9b242cfdb1c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -13,11 +13,12 @@ * * The following locks and mutexes are used by kmemleak: * - * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and - * accesses to the object_tree_root (or object_phys_tree_root). The - * object_list is the main list holding the metadata (struct kmemleak_object) - * for the allocated memory blocks. The object_tree_root and object_phys_tree_root - * are red black trees used to look-up metadata based on a pointer to the + * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as + * del_state modifications and accesses to the object_tree_root (or + * object_phys_tree_root). The object_list is the main list holding the + * metadata (struct kmemleak_object) for the allocated memory blocks. + * The object_tree_root and object_phys_tree_root are red + * black trees used to look-up metadata based on a pointer to the * corresponding memory block. The object_phys_tree_root is for objects * allocated with physical address. The kmemleak_object structures are * added to the object_list and object_tree_root (or object_phys_tree_root) @@ -148,6 +149,7 @@ struct kmemleak_object { struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; + unsigned int del_state; /* deletion state */ unsigned long pointer; size_t size; /* pass surplus references to this pointer */ @@ -177,6 +179,11 @@ struct kmemleak_object { /* flag set for object allocated with physical address */ #define OBJECT_PHYS (1 << 4) +/* set when __remove_object() called */ +#define DELSTATE_REMOVED (1 << 0) +/* set to temporarily prevent deletion from object_list */ +#define DELSTATE_NO_DELETE (1 << 1) + #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 @@ -571,7 +578,9 @@ static void __remove_object(struct kmemleak_object *object) rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? &object_phys_tree_root : &object_tree_root); - list_del_rcu(&object->object_list); + if (!(object->del_state & DELSTATE_NO_DELETE)) + list_del_rcu(&object->object_list); + object->del_state |= DELSTATE_REMOVED; } /* @@ -643,6 +652,7 @@ static void __create_object(unsigned long ptr, size_t size, object->count = 0; /* white color initially */ object->jiffies = jiffies; object->checksum = 0; + object->del_state = 0; /* task information */ if (in_hardirq()) { @@ -1479,9 +1489,22 @@ static void kmemleak_cond_resched(struct kmemleak_object *object) if (!get_object(object)) return; /* Try next object */ + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + goto unlock_put; /* Object removed */ + object->del_state |= DELSTATE_NO_DELETE; + raw_spin_unlock_irq(&kmemleak_lock); + rcu_read_unlock(); cond_resched(); rcu_read_lock(); + + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + list_del_rcu(&object->object_list); + object->del_state &= ~DELSTATE_NO_DELETE; +unlock_put: + raw_spin_unlock_irq(&kmemleak_lock); put_object(object); } -- cgit v1.2.3 From 6b3f013bb90e737b06c7955571407190b4c760ce Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:29 +0000 Subject: mm/damon: update comments in damon.h for damon_attrs Patch series "mm/damon: misc fixes". This patchset contains three miscellaneous simple fixes for DAMON online tuning. This patch (of 3): Commit cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") moved monitoring intervals from damon_ctx to a new struct, damon_attrs, but a comment in the header file has not updated for the change. Update it. Link: https://lkml.kernel.org/r/20230119013831.1911-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230119013831.1911-2-sj@kernel.org Fixes: cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index dfb245bb3053..d5d4d19928e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -354,10 +354,10 @@ struct damon_ctx; * users should register the low level operations for their target address * space and usecase via the &damon_ctx.ops. Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.ops_update_interval, and + * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each - * &damon_ctx.aggr_interval. + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. -- cgit v1.2.3 From 2f5bef5a590be4bf4111ee8f49d97a8613a3e980 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:30 +0000 Subject: mm/damon/core: update monitoring results for new monitoring attributes region->nr_accesses is the number of sampling intervals in the last aggregation interval that access to the region has found, and region->age is the number of aggregation intervals that its access pattern has maintained. Hence, the real meaning of the two fields' values is depending on current sampling and aggregation intervals. This means the values need to be updated for every sampling and/or aggregation intervals updates. As DAMON core doesn't, it is a duty of in-kernel DAMON framework applications like DAMON sysfs interface, or the userspace users. Handling it in userspace or in-kernel DAMON application is complicated, inefficient, and repetitive compared to doing the update in DAMON core. Do the update in DAMON core. Link: https://lkml.kernel.org/r/20230119013831.1911-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2db8c53491ca..d9ef62047bf5 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -465,6 +465,76 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static unsigned int damon_age_for_new_attrs(unsigned int age, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return age * old_attrs->aggr_interval / new_attrs->aggr_interval; +} + +/* convert access ratio in bp (per 10,000) to nr_accesses */ +static unsigned int damon_accesses_bp_to_nr_accesses( + unsigned int accesses_bp, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return accesses_bp * max_nr_accesses / 10000; +} + +/* convert nr_accesses to access ratio in bp (per 10,000) */ +static unsigned int damon_nr_accesses_to_accesses_bp( + unsigned int nr_accesses, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return nr_accesses * 10000 / max_nr_accesses; +} + +static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return damon_accesses_bp_to_nr_accesses( + damon_nr_accesses_to_accesses_bp( + nr_accesses, old_attrs), + new_attrs); +} + +static void damon_update_monitoring_result(struct damon_region *r, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, + old_attrs, new_attrs); + r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); +} + +/* + * region->nr_accesses is the number of sampling intervals in the last + * aggregation interval that access to the region has found, and region->age is + * the number of aggregation intervals that its access pattern has maintained. + * For the reason, the real meaning of the two fields depend on current + * sampling interval and aggregation interval. This function updates + * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. + */ +static void damon_update_monitoring_results(struct damon_ctx *ctx, + struct damon_attrs *new_attrs) +{ + struct damon_attrs *old_attrs = &ctx->attrs; + struct damon_target *t; + struct damon_region *r; + + /* if any interval is zero, simply forgive conversion */ + if (!old_attrs->sample_interval || !old_attrs->aggr_interval || + !new_attrs->sample_interval || + !new_attrs->aggr_interval) + return; + + damon_for_each_target(t, ctx) + damon_for_each_region(r, t) + damon_update_monitoring_result( + r, old_attrs, new_attrs); +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -482,6 +552,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; } -- cgit v1.2.3 From f4c978b6594b7452ef22a2fcff376debafcf25eb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:31 +0000 Subject: mm/damon/core-test: add a test for damon_update_monitoring_results() Add a simple unit test for damon_update_monitoring_results() function. Link: https://lkml.kernel.org/r/20230119013831.1911-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3db9b7368756..fae64d32b925 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -289,6 +289,35 @@ static void damon_test_set_regions(struct kunit *test) damon_destroy_target(t); } +static void damon_test_update_monitoring_result(struct kunit *test) +{ + struct damon_attrs old_attrs = { + .sample_interval = 10, .aggr_interval = 1000,}; + struct damon_attrs new_attrs; + struct damon_region *r = damon_new_region(3, 7); + + r->nr_accesses = 15; + r->age = 20; + + new_attrs = (struct damon_attrs){ + .sample_interval = 100, .aggr_interval = 10000,}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 1000}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 100}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 20); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -299,6 +328,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), KUNIT_CASE(damon_test_set_regions), + KUNIT_CASE(damon_test_update_monitoring_result), {}, }; -- cgit v1.2.3 From b2db9ef2c0926ba86898136704cdc757c7a5a60a Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 19 Jan 2023 09:22:24 +0800 Subject: mm: move KMEMLEAK's Kconfig items from lib to mm Have the kmemleak's source code and Kconfig items be in the same directory. Link: https://lkml.kernel.org/r/1674091345-14799-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Acked-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Cc: ke.wang Cc: Mirsad Goran Todorovac Cc: Nathan Chancellor Cc: Peter Zijlstra (Intel) Cc: Catalin Marinas Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 71 ------------------------------------------------------ mm/Kconfig.debug | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 71 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 139758854ce6..958087475edb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -743,77 +743,6 @@ config SHRINKER_DEBUG visibility into the kernel memory shrinkers subsystem. Disable it to avoid an extra memory footprint. -config HAVE_DEBUG_KMEMLEAK - bool - -config DEBUG_KMEMLEAK - bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK - select DEBUG_FS - select STACKTRACE if STACKTRACE_SUPPORT - select KALLSYMS - select CRC32 - select STACKDEPOT - select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF - help - Say Y here if you want to enable the memory leak - detector. The memory allocation/freeing is traced in a way - similar to the Boehm's conservative garbage collector, the - difference being that the orphan objects are not freed but - only shown in /sys/kernel/debug/kmemleak. Enabling this - feature will introduce an overhead to memory - allocations. See Documentation/dev-tools/kmemleak.rst for more - details. - - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances - of finding leaks due to the slab objects poisoning. - - In order to access the kmemleak file, debugfs needs to be - mounted (usually at /sys/kernel/debug). - -config DEBUG_KMEMLEAK_MEM_POOL_SIZE - int "Kmemleak memory pool size" - depends on DEBUG_KMEMLEAK - range 200 1000000 - default 16000 - help - Kmemleak must track all the memory allocations to avoid - reporting false positives. Since memory may be allocated or - freed before kmemleak is fully initialised, use a static pool - of metadata objects to track such callbacks. After kmemleak is - fully initialised, this memory pool acts as an emergency one - if slab allocations fail. - -config DEBUG_KMEMLEAK_TEST - tristate "Simple test for the kernel memory leak detector" - depends on DEBUG_KMEMLEAK && m - help - This option enables a module that explicitly leaks memory. - - If unsure, say N. - -config DEBUG_KMEMLEAK_DEFAULT_OFF - bool "Default kmemleak to off" - depends on DEBUG_KMEMLEAK - help - Say Y here to disable kmemleak by default. It can then be enabled - on the command line via kmemleak=on. - -config DEBUG_KMEMLEAK_AUTO_SCAN - bool "Enable kmemleak auto scan thread on boot up" - default y - depends on DEBUG_KMEMLEAK - help - Depending on the cpu, kmemleak scan may be cpu intensive and can - stall user tasks at times. This option enables/disables automatic - kmemleak scan at boot up. - - Say N here to disable kmemleak auto scan thread to stop automatic - scanning. Disabling this option disables automatic reporting of - memory leaks. - - If unsure, say Y. - config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && !IA64 diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index d62f48131952..c3547a373c9c 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -207,3 +207,75 @@ config PTDUMP_DEBUGFS kernel. If in doubt, say N. + +config HAVE_DEBUG_KMEMLEAK + bool + +config DEBUG_KMEMLEAK + bool "Kernel memory leak detector" + depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK + select DEBUG_FS + select STACKTRACE if STACKTRACE_SUPPORT + select KALLSYMS + select CRC32 + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF + help + Say Y here if you want to enable the memory leak + detector. The memory allocation/freeing is traced in a way + similar to the Boehm's conservative garbage collector, the + difference being that the orphan objects are not freed but + only shown in /sys/kernel/debug/kmemleak. Enabling this + feature will introduce an overhead to memory + allocations. See Documentation/dev-tools/kmemleak.rst for more + details. + + Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances + of finding leaks due to the slab objects poisoning. + + In order to access the kmemleak file, debugfs needs to be + mounted (usually at /sys/kernel/debug). + +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" + depends on DEBUG_KMEMLEAK + range 200 1000000 + default 16000 + help + Kmemleak must track all the memory allocations to avoid + reporting false positives. Since memory may be allocated or + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. + +config DEBUG_KMEMLEAK_TEST + tristate "Simple test for the kernel memory leak detector" + depends on DEBUG_KMEMLEAK && m + help + This option enables a module that explicitly leaks memory. + + If unsure, say N. + +config DEBUG_KMEMLEAK_DEFAULT_OFF + bool "Default kmemleak to off" + depends on DEBUG_KMEMLEAK + help + Say Y here to disable kmemleak by default. It can then be enabled + on the command line via kmemleak=on. + +config DEBUG_KMEMLEAK_AUTO_SCAN + bool "Enable kmemleak auto scan thread on boot up" + default y + depends on DEBUG_KMEMLEAK + help + Depending on the cpu, kmemleak scan may be cpu intensive and can + stall user tasks at times. This option enables/disables automatic + kmemleak scan at boot up. + + Say N here to disable kmemleak auto scan thread to stop automatic + scanning. Disabling this option disables automatic reporting of + memory leaks. + + If unsure, say Y. + -- cgit v1.2.3 From 7b8144e63d84716f16a1b929e0c7e03ae5c4d5c1 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:21 +0000 Subject: mm: multi-gen LRU: section for working set protection Patch series "mm: multi-gen LRU: improve". This patch series improves a few MGLRU functions, collects related functions, and adds additional documentation. This patch (of 7): Add a section for working set protection in the code and the design doc. The admin doc already contains its usage. Link: https://lkml.kernel.org/r/20230118001827.1040870-1-talumbau@google.com Link: https://lkml.kernel.org/r/20230118001827.1040870-2-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 15 +++++++++++++++ mm/vmscan.c | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d8f721f98868..6e1483e70fdc 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -141,6 +141,21 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. +Working set protection +---------------------- +Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is +set, an ``lruvec`` is protected from the eviction when its oldest +generation was born within ``lru_gen_min_ttl`` milliseconds. In other +words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. + +This time-based approach has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index 394ff4962cbc..a741765896b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4475,6 +4475,10 @@ done: return true; } +/****************************************************************************** + * working set protection + ******************************************************************************/ + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; -- cgit v1.2.3 From db19a43d9b3a8876552f00f656008206ef9a5efa Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:22 +0000 Subject: mm: multi-gen LRU: section for rmap/PT walk feedback Add a section for lru_gen_look_around() in the code and the design doc. Link: https://lkml.kernel.org/r/20230118001827.1040870-3-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 14 ++++++++++++++ mm/vmscan.c | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index 6e1483e70fdc..bd988a142bc2 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -156,6 +156,20 @@ This time-based approach has the following advantages: and memory sizes. 2. It is more reliable because it is directly wired to the OOM killer. +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +``lru_gen_look_around()`` exploits spatial locality to reduce the +trips into the rmap. It scans the adjacent PTEs of a young PTE and +promotes hot pages. If the scan was done cacheline efficiently, it +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index a741765896b6..eb9263bf6806 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4569,6 +4569,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } } +/****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ + /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -- cgit v1.2.3 From ccbbbb85945d8f0255aa9dbc1b617017e2294f2c Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:23 +0000 Subject: mm: multi-gen LRU: section for Bloom filters Move Bloom filters code into a dedicated section. Improve the design doc to explain Bloom filter usage and connection between aging and eviction in their use. Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 16 ++++ mm/vmscan.c | 180 +++++++++++++++++++------------------- 2 files changed, 108 insertions(+), 88 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index bd988a142bc2..770b5d539856 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it adds the PMD entry pointing to the PTE table to the Bloom filter. This forms a feedback loop between the eviction and the aging. +Bloom Filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be +in the set. + +In the eviction path, specifically, in ``lru_gen_look_around()``, if a +PMD has a sufficient number of hot pages, its address is placed in the +filter. In the aging path, set membership means that the PTE range +will be scanned for young pages. + +Note that Bloom filters are probabilistic on set membership. If a test +is false positive, the cost is an additional scan of a range of PTEs, +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. + Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index eb9263bf6806..1be9120349f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3233,6 +3233,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; } +/****************************************************************************** + * Bloom filters + ******************************************************************************/ + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + /****************************************************************************** * mm_struct list ******************************************************************************/ @@ -3352,94 +3444,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif -/* - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of - * bits in a bitmap, k is the number of hash functions and n is the number of - * inserted items. - * - * Page table walkers use one of the two filters to reduce their search space. - * To get rid of non-leaf entries that no longer have enough leaf entries, the - * aging uses the double-buffering technique to flip to the other filter each - * time it produces a new generation. For non-leaf entries that have enough - * leaf entries, the aging carries them over to the next generation in - * walk_pmd_range(); the eviction also report them when walking the rmap - * in lru_gen_look_around(). - * - * For future optimizations: - * 1. It's not necessary to keep both filters all the time. The spare one can be - * freed after the RCU grace period and reallocated if needed again. - * 2. And when reallocating, it's worth scaling its size according to the number - * of inserted entries in the other filter, to reduce the memory overhead on - * small systems and false positives on large systems. - * 3. Jenkins' hash function is an alternative to Knuth's. - */ -#define BLOOM_FILTER_SHIFT 15 - -static inline int filter_gen_from_seq(unsigned long seq) -{ - return seq % NR_BLOOM_FILTERS; -} - -static void get_item_key(void *item, int *key) -{ - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); - - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); - - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); - key[1] = hash >> BLOOM_FILTER_SHIFT; -} - -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -{ - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = lruvec->mm_state.filters[gen]; - if (filter) { - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); - return; - } - - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -} - -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return; - - get_item_key(item, key); - - if (!test_bit(key[0], filter)) - set_bit(key[0], filter); - if (!test_bit(key[1], filter)) - set_bit(key[1], filter); -} - -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return true; - - get_item_key(item, key); - - return test_bit(key[0], filter) && test_bit(key[1], filter); -} - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; -- cgit v1.2.3 From 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:24 +0000 Subject: mm: multi-gen LRU: section for memcg LRU Move memcg LRU code into a dedicated section. Improve the design doc to outline its architecture. Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 33 ++++- include/linux/mm_inline.h | 17 --- include/linux/mmzone.h | 13 +- mm/memcontrol.c | 8 +- mm/vmscan.c | 250 ++++++++++++++++++++++---------------- 5 files changed, 178 insertions(+), 143 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index 770b5d539856..5f1f6ecbb79b 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -186,9 +186,40 @@ is false positive, the cost is an additional scan of a range of PTEs, which may yield hot pages anyway. Parameters of the filter itself can control the false positive rate in the limit. +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of folios (see +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of +global reclaim, which is critical to system-wide memory overcommit in +data centers. Note that memcg LRU only applies to global reclaim. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of folios): + +1. It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of ``max_seq`` triggers promotion, i.e., the + counterpart to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: + +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +In terms of traversing memcgs during global reclaim, it improves the +best-case complexity from O(n) to O(1) and does not affect the +worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity. + Summary ------- -The multi-gen LRU can be disassembled into the following parts: +The multi-gen LRU (of folios) can be disassembled into the following +parts: * Generations * Rmap walks diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 26dcbda07e92..de1e622dd366 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } -#ifdef CONFIG_MEMCG -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return READ_ONCE(lruvec->lrugen.seg); -} -#else -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} -#endif - static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void) return false; } -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} - static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 815c7c2edf45..977be526c939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,15 +368,6 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -/* see the comment on MEMCG_NR_GENS */ -enum { - MEMCG_LRU_NOP, - MEMCG_LRU_HEAD, - MEMCG_LRU_TAIL, - MEMCG_LRU_OLD, - MEMCG_LRU_YOUNG, -}; - #ifdef CONFIG_LRU_GEN enum { @@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); +void lru_gen_soft_reclaim(struct lruvec *lruvec); #else /* !CONFIG_MEMCG */ @@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } -static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 893427aded01..17335459d8dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -476,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_tree_per_node *mctz; if (lru_gen_enabled()) { - struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - - /* see the comment on MEMCG_NR_GENS */ - if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); - + if (soft_limit_excess(memcg)) + lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1be9120349f8..796d4ca65e97 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4705,6 +4705,148 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) mem_cgroup_unlock_pages(); } +/****************************************************************************** + * memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + +#ifdef CONFIG_MEMCG + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} + +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ + /* see the comment on MEMCG_NR_GENS */ + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} + +#else /* !CONFIG_MEMCG */ + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} + +#endif + /****************************************************************************** * the eviction ******************************************************************************/ @@ -5397,53 +5539,6 @@ done: pgdat->kswapd_failures = 0; } -#ifdef CONFIG_MEMCG -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) -{ - int seg; - int old, new; - int bin = get_random_u32_below(MEMCG_NR_BINS); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - seg = 0; - new = old = lruvec->lrugen.gen; - - /* see the comment on MEMCG_NR_GENS */ - if (op == MEMCG_LRU_HEAD) - seg = MEMCG_LRU_HEAD; - else if (op == MEMCG_LRU_TAIL) - seg = MEMCG_LRU_TAIL; - else if (op == MEMCG_LRU_OLD) - new = get_memcg_gen(pgdat->memcg_lru.seq); - else if (op == MEMCG_LRU_YOUNG) - new = get_memcg_gen(pgdat->memcg_lru.seq + 1); - else - VM_WARN_ON_ONCE(true); - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - - if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) - hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - else - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - - pgdat->memcg_lru.nr_memcgs[old]--; - pgdat->memcg_lru.nr_memcgs[new]++; - - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); -} -#endif - /****************************************************************************** * state change ******************************************************************************/ @@ -6086,67 +6181,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } -void lru_gen_online_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - int bin = get_random_u32_below(MEMCG_NR_BINS); - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = get_memcg_gen(pgdat->memcg_lru.seq); - - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); - pgdat->memcg_lru.nr_memcgs[gen]++; - - lruvec->lrugen.gen = gen; - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - -void lru_gen_offline_memcg(struct mem_cgroup *memcg) -{ - int nid; - - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(memcg, nid); - - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); - } -} - -void lru_gen_release_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = lruvec->lrugen.gen; - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - pgdat->memcg_lru.nr_memcgs[gen]--; - - if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) -- cgit v1.2.3 From 37cc99979d04cca677c0ad5c0acd1149ec165d1b Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:25 +0000 Subject: mm: multi-gen LRU: improve lru_gen_exit_memcg() Add warnings and poison ->next. Link: https://lkml.kernel.org/r/20230118001827.1040870-6-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 796d4ca65e97..c2e6ad53447b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6168,12 +6168,17 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; + VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); + lruvec->lrugen.list.next = LIST_POISON1; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(lruvec->mm_state.filters[i]); lruvec->mm_state.filters[i] = NULL; -- cgit v1.2.3 From b5ff4133617d0eced35b685da0bd0929dd9fabb7 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:26 +0000 Subject: mm: multi-gen LRU: improve walk_pmd_range() Improve readability of walk_pmd_range() and walk_pmd_range_locked(). Link: https://lkml.kernel.org/r/20230118001827.1040870-7-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c2e6ad53447b..ff3b4aa3c31f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3999,8 +3999,8 @@ restart: } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; pmd_t *pmd; @@ -4013,18 +4013,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ - if (*start == -1) { - *start = next; + if (*first == -1) { + *first = addr; + bitmap_zero(bitmap, MIN_LRU_BATCH); return; } - i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } - pmd = pmd_offset(pud, *start); + pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) @@ -4035,15 +4036,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area do { unsigned long pfn; struct folio *folio; - unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + /* don't round down the first address */ + addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; pfn = get_pmd_pfn(pmd[i], vma, addr); if (pfn == -1) goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4072,12 +4074,11 @@ next: arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: - *start = -1; - bitmap_zero(bitmap, MIN_LRU_BATCH); + *first = -1; } #else -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { } #endif @@ -4090,9 +4091,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long pos = -1; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4131,18 +4132,17 @@ restart: if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { if (!pmd_young(val)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) @@ -4159,7 +4159,7 @@ restart: update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); } - walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; -- cgit v1.2.3 From abf086721a2f1e6897c57796f7268df1b194c750 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:27 +0000 Subject: mm: multi-gen LRU: simplify lru_gen_look_around() Update the folio generation in place with or without current->reclaim_state->mm_walk. The LRU lock is held for longer, if mm_walk is NULL and the number of folios to update is more than PAGEVEC_SIZE. This causes a measurable regression from the LRU lock contention during a microbencmark. But a tiny regression is not worth the complexity. Link: https://lkml.kernel.org/r/20230118001827.1040870-8-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 73 +++++++++++++++++++------------------------------------------ 1 file changed, 23 insertions(+), 50 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ff3b4aa3c31f..ac51150d2d36 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4587,13 +4587,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; - pte_t *pte; unsigned long start; unsigned long end; - unsigned long addr; struct lru_gen_mm_walk *walk; int young = 0; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -4610,25 +4609,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); - end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; - else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { - start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; - end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } - pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; - rcu_read_lock(); arch_enter_lazy_mmu_mode(); + pte -= (addr - start) / PAGE_SIZE; + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -4653,56 +4655,27 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) !folio_test_swapcache(folio))) folio_mark_dirty(folio); + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + + continue; + } + old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); else if (old_gen != new_gen) - __set_bit(i, bitmap); + folio_activate(folio); } arch_leave_lazy_mmu_mode(); - rcu_read_unlock(); + mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (suitable_to_scan(i, young)) update_bloom_filter(lruvec, max_seq, pvmw->pmd); - - if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - folio_activate(folio); - } - return; - } - - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return; - - if (!walk) { - spin_lock_irq(&lruvec->lru_lock); - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); - } - - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - if (folio_memcg_rcu(folio) != memcg) - continue; - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen < 0 || old_gen == new_gen) - continue; - - if (walk) - update_batch_size(walk, folio, old_gen, new_gen); - else - lru_gen_update_size(lruvec, folio, old_gen, new_gen); - } - - if (!walk) - spin_unlock_irq(&lruvec->lru_lock); - - mem_cgroup_unlock_pages(); } /****************************************************************************** -- cgit v1.2.3 From 44b8f8bf2438bfee3aceae4d647a7460213ff340 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:20 +0000 Subject: mm: memory-failure: add memory failure stats to sysfs Patch series "Introduce per NUMA node memory error statistics", v2. Background ========== In the RFC for Kernel Support of Memory Error Detection [1], one advantage of software-based scanning over hardware patrol scrubber is the ability to make statistics visible to system administrators. The statistics include 2 categories: * Memory error statistics, for example, how many memory error are encountered, how many of them are recovered by the kernel. Note these memory errors are non-fatal to kernel: during the machine check exception (MCE) handling kernel already classified MCE's severity to be unnecessary to panic (but either action required or optional). * Scanner statistics, for example how many times the scanner have fully scanned a NUMA node, how many errors are first detected by the scanner. The memory error statistics are useful to userspace and actually not specific to scanner detected memory errors, and are the focus of this patchset. Motivation ========== Memory error stats are important to userspace but insufficient in kernel today. Datacenter administrators can better monitor a machine's memory health with the visible stats. For example, while memory errors are inevitable on servers with 10+ TB memory, starting server maintenance when there are only 1~2 recovered memory errors could be overreacting; in cloud production environment maintenance usually means live migrate all the workload running on the server and this usually causes nontrivial disruption to the customer. Providing insight into the scope of memory errors on a system helps to determine the appropriate follow-up action. In addition, the kernel's existing memory error stats need to be standardized so that userspace can reliably count on their usefulness. Today kernel provides following memory error info to userspace, but they are not sufficient or have disadvantages: * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, not per NUMA node stats though * ras:memory_failure_event: only available after explicitly enabled * /dev/mcelog provides many useful info about the MCEs, but doesn't capture how memory_failure recovered memory MCEs * kernel logs: userspace needs to process log text Exposing memory error stats is also a good start for the in-kernel memory error detector. Today the data source of memory error stats are either direct memory error consumption, or hardware patrol scrubber detection (either signaled as UCNA or SRAO). Once in-kernel memory scanner is implemented, it will be the main source as it is usually configured to scan memory DIMMs constantly and faster than hardware patrol scrubber. How Implemented =============== As Naoya pointed out [2], exposing memory error statistics to userspace is useful independent of software or hardware scanner. Therefore we implement the memory error statistics independent of the in-kernel memory error detector. It exposes the following per NUMA node memory error counters: /sys/devices/system/node/node${X}/memory_failure/total /sys/devices/system/node/node${X}/memory_failure/recovered /sys/devices/system/node/node${X}/memory_failure/ignored /sys/devices/system/node/node${X}/memory_failure/failed /sys/devices/system/node/node${X}/memory_failure/delayed These counters describe how many raw pages are poisoned and after the attempted recoveries by the kernel, their resolutions: how many are recovered, ignored, failed, or delayed respectively. This approach can be easier to extend for future use cases than /proc/meminfo, trace event, and log. The following math holds for the statistics: * total = recovered + ignored + failed + delayed These memory error stats are reset during machine boot. The 1st commit introduces these sysfs entries. The 2nd commit populates memory error stats every time memory_failure attempts memory error recovery. The 3rd commit adds documentations for introduced stats. [1] https://lore.kernel.org/linux-mm/7E670362-C29E-4626-B546-26530D54F937@gmail.com/T/#mc22959244f5388891c523882e61163c6e4d703af [2] https://lore.kernel.org/linux-mm/7E670362-C29E-4626-B546-26530D54F937@gmail.com/T/#m52d8d7a333d8536bd7ce74253298858b1c0c0ac6 This patch (of 3): Today kernel provides following memory error info to userspace, but each has its own disadvantage * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, not per NUMA node stats though * ras:memory_failure_event: only available after explicitly enabled * /dev/mcelog provides many useful info about the MCEs, but doesn't capture how memory_failure recovered memory MCEs * kernel logs: userspace needs to process log text Exposes per NUMA node memory error stats as sysfs entries: /sys/devices/system/node/node${X}/memory_failure/total /sys/devices/system/node/node${X}/memory_failure/recovered /sys/devices/system/node/node${X}/memory_failure/ignored /sys/devices/system/node/node${X}/memory_failure/failed /sys/devices/system/node/node${X}/memory_failure/delayed These counters describe how many raw pages are poisoned and after the attempted recoveries by the kernel, their resolutions: how many are recovered, ignored, failed, or delayed respectively. The following math holds for the statistics: * total = recovered + ignored + failed + delayed Link: https://lkml.kernel.org/r/20230120034622.2698268-1-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20230120034622.2698268-2-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: David Rientjes Acked-by: Naoya Horiguchi Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/node.c | 3 +++ include/linux/mm.h | 5 +++++ include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++ mm/memory-failure.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index faf3597a96da..b46db17124f3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { &node_dev_group, #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP &arch_node_dev_group, +#endif +#ifdef CONFIG_MEMORY_FAILURE + &memory_failure_attr_group, #endif NULL }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 836b96e08a14..c9db257f09b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3455,6 +3455,11 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 977be526c939..9fb1b03b83b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1212,6 +1212,31 @@ struct deferred_split { }; #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Per NUMA node memory failure handling statistics. + */ +struct memory_failure_stats { + /* + * Number of raw pages poisoned. + * Cases not accounted: memory outside kernel control, offline page, + * arch-specific memory_failure (SGX), hwpoison_filter() filtered + * error events, and unpoison actions from hwpoison_unpoison. + */ + unsigned long total; + /* + * Recovery results of poisoned raw pages handled by memory_failure, + * in sync with mf_result. + * total = ignored + failed + delayed + recovered. + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. + */ + unsigned long ignored; + unsigned long failed; + unsigned long delayed; + unsigned long recovered; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -1357,6 +1382,9 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif +#ifdef CONFIG_MEMORY_FAILURE + struct memory_failure_stats mf_stats; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0a382191737f..44eec2e93a0b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) memblk_nr_poison_sub(pfn, i); } +/** + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. + * @_name: name of the file in the per NUMA sysfs directory. + */ +#define MF_ATTR_RO(_name) \ +static ssize_t _name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct memory_failure_stats *mf_stats = \ + &NODE_DATA(dev->id)->mf_stats; \ + return sprintf(buf, "%lu\n", mf_stats->_name); \ +} \ +static DEVICE_ATTR_RO(_name) + +MF_ATTR_RO(total); +MF_ATTR_RO(ignored); +MF_ATTR_RO(failed); +MF_ATTR_RO(delayed); +MF_ATTR_RO(recovered); + +static struct attribute *memory_failure_attr[] = { + &dev_attr_total.attr, + &dev_attr_ignored.attr, + &dev_attr_failed.attr, + &dev_attr_delayed.attr, + &dev_attr_recovered.attr, + NULL, +}; + +const struct attribute_group memory_failure_attr_group = { + .name = "memory_failure", + .attrs = memory_failure_attr, +}; + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, -- cgit v1.2.3 From 18f41fa616ee4d66c67033eb46b951bf6e1b4a12 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:21 +0000 Subject: mm: memory-failure: bump memory failure stats to pglist_data Right before memory_failure finishes its handling, accumulate poisoned page's resolution counters to pglist_data's memory_failure_stats, so as to update the corresponding sysfs entries. Tested: 1) Start an application to allocate memory buffer chunks 2) Convert random memory buffer addresses to physical addresses 3) Inject memory errors using EINJ at chosen physical addresses 4) Access poisoned memory buffer and recover from SIGBUS 5) Check counter values under /sys/devices/system/node/node*/memory_failure/* Link: https://lkml.kernel.org/r/20230120034622.2698268-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: David Rientjes Acked-by: Naoya Horiguchi Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44eec2e93a0b..b4b30d9b0782 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1227,6 +1227,39 @@ static struct page_state error_states[] = { #undef slab #undef reserved +static void update_per_node_mf_stats(unsigned long pfn, + enum mf_result result) +{ + int nid = MAX_NUMNODES; + struct memory_failure_stats *mf_stats = NULL; + + nid = pfn_to_nid(pfn); + if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) { + WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid); + return; + } + + mf_stats = &NODE_DATA(nid)->mf_stats; + switch (result) { + case MF_IGNORED: + ++mf_stats->ignored; + break; + case MF_FAILED: + ++mf_stats->failed; + break; + case MF_DELAYED: + ++mf_stats->delayed; + break; + case MF_RECOVERED: + ++mf_stats->recovered; + break; + default: + WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result); + break; + } + ++mf_stats->total; +} + /* * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). @@ -1237,6 +1270,9 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(pfn); + + update_per_node_mf_stats(pfn, result); + pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); -- cgit v1.2.3 From 4180887f0625af739896aaafc44ee98103ab8f71 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:22 +0000 Subject: mm: memory-failure: document memory failure stats Add documentation for memory_failure's per NUMA node sysfs entries Link: https://lkml.kernel.org/r/20230120034622.2698268-4-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Naoya Horiguchi Cc: David Rientjes Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/ABI/stable/sysfs-devices-node | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 8db67aa472f1..402af4b2b905 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -182,3 +182,42 @@ Date: November 2021 Contact: Jarkko Sakkinen Description: The total amount of SGX physical memory in bytes. + +What: /sys/devices/system/node/nodeX/memory_failure/total +Date: January 2023 +Contact: Jiaqi Yan +Description: + The total number of raw poisoned pages (pages containing + corrupted data due to memory errors) on a NUMA node. + +What: /sys/devices/system/node/nodeX/memory_failure/ignored +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + ignored by memory error recovery attempt, usually because + support for this type of pages is unavailable, and kernel + gives up the recovery. + +What: /sys/devices/system/node/nodeX/memory_failure/failed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + failed by memory error recovery attempt. This usually means + a key recovery operation failed. + +What: /sys/devices/system/node/nodeX/memory_failure/delayed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + delayed by memory error recovery attempt. Delayed poisoned + pages usually will be retried by kernel. + +What: /sys/devices/system/node/nodeX/memory_failure/recovered +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + recovered by memory error recovery attempt. -- cgit v1.2.3 From 05a421995503b746095d8ac93fa0ddadfc3c81bc Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Sun, 22 Jan 2023 01:50:54 +0900 Subject: mm/page_owner: record single timestamp value for high order allocations When allocating a high-order page, separate allocation timestamp is recorded for each sub-page resulting in different timestamp values between them. This behavior is not consistent with the behavior when recording free timestamp and caused confusion when analyzing memory dumps. Record single timestamp for the entire allocation, aligning with the behavior for free timestamps. Link: https://lkml.kernel.org/r/20230121165054.520507-1-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index f0553bedb39d..80dc8f4050fa 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -163,6 +163,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, { struct page_owner *page_owner; int i; + u64 ts_nsec = local_clock(); for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); @@ -172,7 +173,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->tgid = current->tgid; - page_owner->ts_nsec = local_clock(); + page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); -- cgit v1.2.3 From 2e126aa29007353f069c3bab5c2cf52d231cd3ae Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 21 Jan 2023 12:11:51 +0200 Subject: mm/sparse: fix "unused function 'pgdat_to_phys'" warning W=1 build with clangs complains: mm/sparse.c:347:27: warning: unused function 'pgdat_to_phys' [-Wunused-function] static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) ^ 1 warning generated. pgdat_to_phys() is only used by functions defined when CONFIG_MEMORY_HOTREMOVE=y. Move pgdat_to_phys() under #ifdef CONFIG_MEMORY_HOTREMOVE to make clang happy. Link: https://lkml.kernel.org/r/20230121101151.1703292-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reported-by: kernel test robot Link: https://lore.kernel.org/all/202301210155.1E5zABb5-lkp@intel.com Cc: Miles Chen Signed-off-by: Andrew Morton --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 2779b419ef2a..fb7aeb1899a4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -318,6 +318,7 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } +#ifdef CONFIG_MEMORY_HOTREMOVE static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA @@ -328,7 +329,6 @@ static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) #endif } -#ifdef CONFIG_MEMORY_HOTREMOVE static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) -- cgit v1.2.3 From 420ef683b5217338bc679c33fd9361b52f53a526 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Jan 2023 21:35:26 +0100 Subject: kasan: reset page tags properly with sampling The implementation of page_alloc poisoning sampling assumed that tag_clear_highpage resets page tags for __GFP_ZEROTAGS allocations. However, this is no longer the case since commit 70c248aca9e7 ("mm: kasan: Skip unpoisoning of user pages"). This leads to kernel crashes when MTE-enabled userspace mappings are used with Hardware Tag-Based KASAN enabled. Reset page tags for __GFP_ZEROTAGS allocations in post_alloc_hook(). Also clarify and fix related comments. [andreyknvl@google.com: update comment] Link: https://lkml.kernel.org/r/5dbd866714b4839069e2d8469ac45b60953db290.1674592780.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/24ea20c1b19c2b4b56cf9f5b354915f8dbccfc77.1674592496.git.andreyknvl@google.com Fixes: 44383cef54c0 ("kasan: allow sampling page_alloc allocations for HW_TAGS") Signed-off-by: Andrey Konovalov Reported-by: Peter Collingbourne Tested-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 717f12e83b85..5ebce58026f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2476,7 +2476,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); - bool reset_tags = !zero_tags; + bool reset_tags = true; int i; set_page_private(page, 0); @@ -2503,7 +2503,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * (which happens only when memory should be initialized as well). */ if (zero_tags) { - /* Initialize both memory and tags. */ + /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2521,14 +2521,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } else { /* * KASAN decided to exclude this allocation from being - * poisoned due to sampling. Skip poisoning as well. + * (un)poisoned due to sampling. Make KASAN skip + * poisoning when the allocation is freed. */ SetPageSkipKASanPoison(page); } } /* - * If memory tags have not been set, reset the page tags to ensure - * page_address() dereferencing does not fault. + * If memory tags have not been set by KASAN, reset the page tags to + * ensure page_address() dereferencing does not fault. */ if (reset_tags) { for (i = 0; i != 1 << order; ++i) -- cgit v1.2.3 From c5acf1f6f0a11b8e521ad43f59b1bed27dcf34f6 Mon Sep 17 00:00:00 2001 From: Jongwoo Han Date: Thu, 26 Jan 2023 03:08:47 +0900 Subject: mm/gup.c: fix typo in comments Link: https://lkml.kernel.org/r/20230125180847.4542-1-jongwooo.han@gmail.com Signed-off-by: Jongwoo Han Signed-off-by: Andrew Morton --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 38ba1697dd61..c4b793385ed2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1055,7 +1055,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page + * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because -- cgit v1.2.3 From 48731c8436c68ce5597dfe72f3836bd6808bedde Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:31 +0000 Subject: mm, compaction: rename compact_control->rescan to finish_pageblock Patch series "Fix excessive CPU usage during compaction". Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") fixed a problem where pageblocks found by fast_find_migrateblock() were ignored. Unfortunately there were numerous bug reports complaining about high CPU usage and massive stalls once 6.1 was released. Due to the severity, the patch was reverted by Vlastimil as a short-term fix[1] to -stable. The underlying problem for each of the bugs is suspected to be the repeated scanning of the same pageblocks. This series should guarantee forward progress even with commit 7efc3b726103. More information is in the changelog for patch 4. [1] http://lore.kernel.org/r/20230113173345.9692-1-vbabka@suse.cz This patch (of 4): The rescan field was not well named albeit accurate at the time. Rename the field to finish_pageblock to indicate that the remainder of the pageblock should be scanned regardless of COMPACT_CLUSTER_MAX. The intent is that pageblocks with transient failures get marked for skipping to avoid revisiting the same pageblock. Link: https://lkml.kernel.org/r/20230125134434.18017-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 24 ++++++++++++------------ mm/internal.h | 6 +++++- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b758b00a4885..28a9596609fe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1101,12 +1101,12 @@ isolate_success_no_list: /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. */ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1171,14 +1171,14 @@ isolate_abort: } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -2372,17 +2372,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == pageblock_start_pfn(iteration_start_pfn)) { - cc->rescan = true; + cc->finish_pageblock = true; } switch (isolate_migratepages(cc)) { diff --git a/mm/internal.h b/mm/internal.h index ce462bf145b4..2d1b9fa8083e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -448,7 +448,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; -- cgit v1.2.3 From 16b3be4034316bf56a171478cf1dccdf94dede43 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:32 +0000 Subject: mm, compaction: check if a page has been captured before draining PCP pages If a page has been captured then draining is unnecssary so check first for a captured page. Link: https://lkml.kernel.org/r/20230125134434.18017-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 28a9596609fe..a86559910fd9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2439,6 +2439,12 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2457,12 +2463,6 @@ check_drain: last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: -- cgit v1.2.3 From f9d7fc1ae3349759f25903cd867ab72e6ba4a63e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:33 +0000 Subject: mm, compaction: finish scanning the current pageblock if requested cc->finish_pageblock is set when the current pageblock should be rescanned but fast_find_migrateblock can select an alternative block. Disable fast_find_migrateblock when the current pageblock scan should be completed. Link: https://lkml.kernel.org/r/20230125134434.18017-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index a86559910fd9..91acde906ae3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1761,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. + */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous -- cgit v1.2.3 From cfccd2e63e7e0c84c514676cffa755dd71a3b2bc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:34 +0000 Subject: mm, compaction: finish pageblocks on complete migration failure Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") address an issue where a pageblock selected by fast_find_migrateblock() was ignored. Unfortunately, the same fix resulted in numerous reports of khugepaged or kcompactd stalling for long periods of time or consuming 100% of CPU. Tracing showed that there was a lot of rescanning between a small subset of pageblocks because the conditions for marking the block skip are not met. The scan is not reaching the end of the pageblock because enough pages were isolated but none were migrated successfully. Eventually it circles back to the same block. Pageblock skip tracking tries to minimise both latency and excessive scanning but tracking exactly when a block is fully scanned requires an excessive amount of state. This patch forcibly rescans a pageblock when all isolated pages fail to migrate even though it could be for transient reasons such as page writeback or page dirty. This will sometimes migrate too many pages but pageblocks will be marked skip and forward progress will be made. "Usemen" from the mmtests configuration workload-usemem-stress-numa-compact was used to stress compaction. The compaction trace events were recorded using a 6.2-rc5 kernel that includes commit 7efc3b726103 and count of unique ranges were measured. The top 5 ranges were 3076 range=(0x10ca00-0x10cc00) 3076 range=(0x110a00-0x110c00) 3098 range=(0x13b600-0x13b800) 3104 range=(0x141c00-0x141e00) 11424 range=(0x11b600-0x11b800) While this workload is very different than what the bugs reported, the pattern of the same subset of blocks being repeatedly scanned is observed. At one point, *only* the range range=(0x11b600 ~ 0x11b800) was scanned for 2 seconds. 14 seconds passed between the first migration-related event and the last. With the series applied including this patch, the top 5 ranges were 1 range=(0x11607e-0x116200) 1 range=(0x116200-0x116278) 1 range=(0x116278-0x116400) 1 range=(0x116400-0x116424) 1 range=(0x116424-0x116600) Only unique ranges were scanned and the time between the first migration-related event was 0.11 milliseconds. Link: https://lkml.kernel.org/r/20230125134434.18017-5-mgorman@techsingularity.net Fixes: 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 91acde906ae3..d73578af44cc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2392,6 +2392,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2434,15 +2435,28 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. */ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. + */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } -- cgit v1.2.3 From 37f3605e5e7af7de12aeb670c5b94e5a3c8dbf74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:42 +0100 Subject: mm: reject vmap with VM_FLUSH_RESET_PERMS Patch series "cleanup vfree and vunmap". This little series untangles the vfree and vunmap code path a bit. This patch (of 10): VM_FLUSH_RESET_PERMS is just for use with vmalloc as it is tied to freeing the underlying pages. Link: https://lkml.kernel.org/r/20230121071051.1143058-1-hch@lst.de Link: https://lkml.kernel.org/r/20230121071051.1143058-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 428e0bee5c9c..3f3cb4875966 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2868,6 +2868,9 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) + return NULL; + /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. -- cgit v1.2.3 From f41f036b804d0d920f9b6fd3fca9489dd7afd358 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:43 +0100 Subject: mm: remove __vfree __vfree is a subset of vfree that just skips a few checks, and which is only used by vfree and an error cleanup path. Fold __vfree into vfree and switch the only other caller to call vfree() instead. Link: https://lkml.kernel.org/r/20230121071051.1143058-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3f3cb4875966..67fc9d7e4024 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2786,14 +2786,6 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } -static void __vfree(const void *addr) -{ - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); -} - /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -2821,8 +2813,10 @@ void vfree(const void *addr) if (!addr) return; - - __vfree(addr); + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -3089,7 +3083,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* * If not enough pages were obtained to accomplish an - * allocation request, free them via __vfree() if any. + * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, @@ -3129,7 +3123,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - __vfree(area->addr); + vfree(area->addr); return NULL; } -- cgit v1.2.3 From 01e2e8394a527644de5192f92f64e1c883a3e493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:44 +0100 Subject: mm: remove __vfree_deferred Fold __vfree_deferred into vfree_atomic, and call vfree_atomic early on from vfree if called from interrupt context so that the extra low-level helper can be avoided. Link: https://lkml.kernel.org/r/20230121071051.1143058-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 67fc9d7e4024..cfd796570e61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2754,20 +2754,6 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } -static inline void __vfree_deferred(const void *addr) -{ - /* - * Use raw_cpu_ptr() because this can be called from preemptible - * context. Preemption is absolutely fine here, because the llist_add() - * implementation is lockless, so it works even if we are adding to - * another cpu's list. schedule_work() should be fine with this too. - */ - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); - - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); -} - /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -2777,13 +2763,19 @@ static inline void __vfree_deferred(const void *addr) */ void vfree_atomic(const void *addr) { - BUG_ON(in_nmi()); + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + BUG_ON(in_nmi()); kmemleak_free(addr); - if (!addr) - return; - __vfree_deferred(addr); + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + if (addr && llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } /** @@ -2805,17 +2797,16 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { - BUG_ON(in_nmi()); + if (unlikely(in_interrupt())) { + vfree_atomic(addr); + return; + } + BUG_ON(in_nmi()); kmemleak_free(addr); + might_sleep(); - might_sleep_if(!in_interrupt()); - - if (!addr) - return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else + if (addr) __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); -- cgit v1.2.3 From 208162f42f958b37147d3c1c5f947c7c1a8b9c41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:45 +0100 Subject: mm: move vmalloc_init and free_work down in vmalloc.c Move these two functions around a bit to avoid forward declarations. Link: https://lkml.kernel.org/r/20230121071051.1143058-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 104 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfd796570e61..333228c652d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,17 +89,6 @@ struct vfree_deferred { }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); -static void __vunmap(const void *, int); - -static void free_work(struct work_struct *w) -{ - struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *t, *llnode; - - llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); -} - /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -2434,48 +2423,6 @@ static void vmap_init_free_space(void) } } -void __init vmalloc_init(void) -{ - struct vmap_area *va; - struct vm_struct *tmp; - int i; - - /* - * Create the cache for vmap_area objects. - */ - vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); - - for_each_possible_cpu(i) { - struct vmap_block_queue *vbq; - struct vfree_deferred *p; - - vbq = &per_cpu(vmap_block_queue, i); - spin_lock_init(&vbq->lock); - INIT_LIST_HEAD(&vbq->free); - p = &per_cpu(vfree_deferred, i); - init_llist_head(&p->list); - INIT_WORK(&p->wq, free_work); - } - - /* Import existing vmlist entries. */ - for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (WARN_ON_ONCE(!va)) - continue; - - va->va_start = (unsigned long)tmp->addr; - va->va_end = va->va_start + tmp->size; - va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - } - - /* - * Now we can initialize a free vmap space. - */ - vmap_init_free_space(); - vmap_initialized = true; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2754,6 +2701,15 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } +static void delayed_vfree_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); +} + /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -4209,3 +4165,45 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, delayed_vfree_work); + } + + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } + + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); + vmap_initialized = true; +} -- cgit v1.2.3 From 5d3d31d6fb17a8eb83af50ea8a0616a3cfde3e58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:46 +0100 Subject: mm: call vfree instead of __vunmap from delayed_vfree_work This adds an extra, never taken, in_interrupt() branch, but will allow to cut down the maze of vfree helpers. Link: https://lkml.kernel.org/r/20230121071051.1143058-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 333228c652d1..0c0267b34afa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2707,7 +2707,7 @@ static void delayed_vfree_work(struct work_struct *w) struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); + vfree(llnode); } /** -- cgit v1.2.3 From 39e65b7f63392d70f2f6aff5f4c5c3262f49637e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:47 +0100 Subject: mm: move __remove_vm_area out of va_remove_mappings __remove_vm_area is the only part of va_remove_mappings that requires a vmap_area. Move the call out to the caller and only pass the vm_struct to va_remove_mappings. Link: https://lkml.kernel.org/r/20230121071051.1143058-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0c0267b34afa..003e49d0e628 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2599,18 +2599,15 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ -static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { - struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - __remove_vm_area(va); - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ if (!flush_reset) return; @@ -2676,7 +2673,8 @@ static void __vunmap(const void *addr, int deallocate_pages) kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - va_remove_mappings(va, deallocate_pages); + __remove_vm_area(va); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { int i; -- cgit v1.2.3 From 75c59ce74e47d3e11aa7666f1877aa64495f7b03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:48 +0100 Subject: mm: use remove_vm_area in __vunmap Use the common helper to find and remove a vmap_area instead of open coding it. Link: https://lkml.kernel.org/r/20230121071051.1143058-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 003e49d0e628..bc6791a0adbe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2556,20 +2556,6 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } -static struct vm_struct *__remove_vm_area(struct vmap_area *va) -{ - struct vm_struct *vm; - - if (!va || !va->vm) - return NULL; - - vm = va->vm; - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; -} - /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2582,10 +2568,18 @@ static struct vm_struct *__remove_vm_area(struct vmap_area *va) */ struct vm_struct *remove_vm_area(const void *addr) { + struct vmap_area *va; + struct vm_struct *vm; + might_sleep(); - return __remove_vm_area( - find_unlink_vmap_area((unsigned long) addr)); + va = find_unlink_vmap_area((unsigned long)addr); + if (!va || !va->vm) + return NULL; + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2651,7 +2645,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; - struct vmap_area *va; if (!addr) return; @@ -2660,20 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - va = find_unlink_vmap_area((unsigned long)addr); - if (unlikely(!va)) { + area = remove_vm_area(addr); + if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - __remove_vm_area(va); vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { -- cgit v1.2.3 From 17d3ef432dcbe80c134f1f79e2ed1ebd1076eab1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:49 +0100 Subject: mm: move debug checks from __vunmap to remove_vm_area All these checks apply to the free_vm_area interface as well, so move them to the common routine. Link: https://lkml.kernel.org/r/20230121071051.1143058-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bc6791a0adbe..cb8c8cd161c8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2573,11 +2573,20 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return NULL; + va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; + + debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); + debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); + free_unmap_vmap_area(va); return vm; } @@ -2649,10 +2658,6 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)) - return; - area = remove_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", @@ -2660,11 +2665,6 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); - debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - - kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { -- cgit v1.2.3 From 79311c1fe0175941298fb362ba072514e2fe5c54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:50 +0100 Subject: mm: split __vunmap vunmap only needs to find and free the vmap_area and vm_strut, so open code that there and merge the rest of the code into vfree. Link: https://lkml.kernel.org/r/20230121071051.1143058-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 84 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb8c8cd161c8..11144a29665a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2651,45 +2651,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } -static void __vunmap(const void *addr, int deallocate_pages) -{ - struct vm_struct *area; - - if (!addr) - return; - - area = remove_vm_area(addr); - if (unlikely(!area)) { - WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", - addr); - return; - } - - vm_remove_mappings(area, deallocate_pages); - - if (deallocate_pages) { - int i; - - for (i = 0; i < area->nr_pages; i++) { - struct page *page = area->pages[i]; - - BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); - cond_resched(); - } - atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); - - kvfree(area->pages); - } - - kfree(area); -} - static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); @@ -2742,6 +2703,9 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { + struct vm_struct *vm; + int i; + if (unlikely(in_interrupt())) { vfree_atomic(addr); return; @@ -2751,8 +2715,32 @@ void vfree(const void *addr) kmemleak_free(addr); might_sleep(); - if (addr) - __vunmap(addr, 1); + if (!addr) + return; + + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + vm_remove_mappings(vm, true); + for (i = 0; i < vm->nr_pages; i++) { + struct page *page = vm->pages[i]; + + BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); + cond_resched(); + } + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + kvfree(vm->pages); + kfree(vm); } EXPORT_SYMBOL(vfree); @@ -2767,10 +2755,20 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { + struct vm_struct *vm; + BUG_ON(in_interrupt()); might_sleep(); - if (addr) - __vunmap(addr, 0); + + if (!addr) + return; + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", + addr); + return; + } + kfree(vm); } EXPORT_SYMBOL(vunmap); -- cgit v1.2.3 From 9e5fa0ae52fc67dea86f95ea4e3909b3e10a160f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:51 +0100 Subject: mm: refactor va_remove_mappings Move the VM_FLUSH_RESET_PERMS to the caller and rename the function to better describe what it is doing. Link: https://lkml.kernel.org/r/20230121071051.1143058-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 11144a29665a..9b71ec3213cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2602,35 +2602,23 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* + * Flush the vm mapping and reset the direct map. + */ +static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); - int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ - if (!flush_reset) - return; - - /* - * If not deallocating pages, just do the flush of the VM area and - * return. - */ - if (!deallocate_pages) { - vm_unmap_aliases(); - return; - } - /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2725,7 +2713,8 @@ void vfree(const void *addr) return; } - vm_remove_mappings(vm, true); + if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) + vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; -- cgit v1.2.3 From 7d28631786b2333c5d48ad25172eb159aaa2945f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:30 +0100 Subject: mpage: stop using bdev_{read,write}_page Patch series "remove ->rw_page". This series removes the ->rw_page block_device_operation, which is an old and clumsy attempt at a simple read/write fast path for the block layer. It isn't actually used by the fastest block layer operations that we support (polled I/O through io_uring), but only used by the mpage buffered I/O helpers which are some of the slowest I/O we have and do not make any difference there at all, and zram which is a block device abused to duplicate the zram functionality. Given that zram is heavily used we need to make sure there is a good replacement for synchronous I/O, so this series adds a new flag for drivers that complete I/O synchronously and uses that flag to use on-stack bios and synchronous submission for them in the swap code. This patch (of 7): These are micro-optimizations for synchronous I/O, which do not matter compared to all the other inefficiencies in the legacy buffer_head based mpage code. Link: https://lkml.kernel.org/r/20230125133436.447864-1-hch@lst.de Link: https://lkml.kernel.org/r/20230125133436.447864-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Keith Busch Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- fs/mpage.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index b8e7975159bc..55988ea994ee 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -269,11 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) alloc_new: if (args->bio == NULL) { - if (first_hole == blocks_per_page) { - if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - &folio->page)) - goto out; - } args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) @@ -585,11 +580,6 @@ page_is_mapped: alloc_new: if (bio == NULL) { - if (first_unmapped == blocks_per_page) { - if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) - goto out; - } bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); -- cgit v1.2.3 From a8c1408f870ef5308088b02c76082136b2c514ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:31 +0100 Subject: mm: remove the swap_readpage return value swap_readpage always returns 0, and no caller checks the return value. [akpm@linux-foundation.org: fix void-returning swap_readpage() stub, per Keith] Link: https://lkml.kernel.org/r/20230125133436.447864-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 16 +++++----------- mm/swap.h | 8 +++----- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 905d9fcc0c96..84b348fe4c7c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,11 +444,9 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -int swap_readpage(struct page *page, bool synchronous, - struct swap_iocb **plug) +void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) { struct bio *bio; - int ret = 0; struct swap_info_struct *sis = page_swap_info(page); bool workingset = PageWorkingset(page); unsigned long pflags; @@ -480,15 +478,12 @@ int swap_readpage(struct page *page, bool synchronous, goto out; } - if (sis->flags & SWP_SYNCHRONOUS_IO) { - ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); - if (!ret) { - count_vm_event(PSWPIN); - goto out; - } + if ((sis->flags & SWP_SYNCHRONOUS_IO) && + !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { + count_vm_event(PSWPIN); + goto out; } - ret = 0; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; @@ -520,7 +515,6 @@ out: psi_memstall_leave(&pflags); } delayacct_swapin_end(); - return ret; } void __swap_read_unplug(struct swap_iocb *sio) diff --git a/mm/swap.h b/mm/swap.h index f78065c8ef52..c8fdda601751 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -8,8 +8,7 @@ /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug); +void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -64,10 +63,9 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug) +static inline void swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) { - return 0; } static inline void swap_write_unplug(struct swap_iocb *sio) { -- cgit v1.2.3 From 14bd75f57400dba0e75eaee4dcb44ac52a46253f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:32 +0100 Subject: mm: factor out a swap_readpage_bdev helper Split the block device case from swap_readpage into a separate helper, following the abstraction for file based swap and frontswap. Link: https://lkml.kernel.org/r/20230125133436.447864-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 68 +++++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 84b348fe4c7c..872a226d15dc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,44 +444,15 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +static void swap_readpage_bdev(struct page *page, bool synchronous, + struct swap_info_struct *sis) { struct bio *bio; - struct swap_info_struct *sis = page_swap_info(page); - bool workingset = PageWorkingset(page); - unsigned long pflags; - bool in_thrashing; - - VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageUptodate(page), page); - - /* - * Count submission time as memory stall and delay. When the device - * is congested, or the submitting cgroup IO-throttled, submission - * can be a significant part of overall IO time. - */ - if (workingset) { - delayacct_thrashing_start(&in_thrashing); - psi_memstall_enter(&pflags); - } - delayacct_swapin_start(); - - if (frontswap_load(page) == 0) { - SetPageUptodate(page); - unlock_page(page); - goto out; - } - - if (data_race(sis->flags & SWP_FS_OPS)) { - swap_readpage_fs(page, plug); - goto out; - } if ((sis->flags & SWP_SYNCHRONOUS_IO) && !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { count_vm_event(PSWPIN); - goto out; + return; } bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); @@ -508,8 +479,39 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } __set_current_state(TASK_RUNNING); bio_put(bio); +} + +void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +{ + struct swap_info_struct *sis = page_swap_info(page); + bool workingset = PageWorkingset(page); + unsigned long pflags; + bool in_thrashing; + + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall and delay. When the device + * is congested, or the submitting cgroup IO-throttled, submission + * can be a significant part of overall IO time. + */ + if (workingset) { + delayacct_thrashing_start(&in_thrashing); + psi_memstall_enter(&pflags); + } + delayacct_swapin_start(); + + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + } else if (data_race(sis->flags & SWP_FS_OPS)) { + swap_readpage_fs(page, plug); + } else { + swap_readpage_bdev(page, synchronous, sis); + } -out: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); -- cgit v1.2.3 From 9b4e30bd7309222f74a5198f44bd45feea024b00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:33 +0100 Subject: mm: use an on-stack bio for synchronous swapin Optimize the synchronous swap in case by using an on-stack bio instead of allocating one using bio_alloc. Link: https://lkml.kernel.org/r/20230125133436.447864-5-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 69 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 872a226d15dc..d47def70e81f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -51,10 +51,9 @@ static void end_swap_bio_write(struct bio *bio) bio_put(bio); } -static void end_swap_bio_read(struct bio *bio) +static void __end_swap_bio_read(struct bio *bio) { struct page *page = bio_first_page_all(bio); - struct task_struct *waiter = bio->bi_private; if (bio->bi_status) { SetPageError(page); @@ -62,18 +61,16 @@ static void end_swap_bio_read(struct bio *bio) pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); - goto out; + } else { + SetPageUptodate(page); } - - SetPageUptodate(page); -out: unlock_page(page); - WRITE_ONCE(bio->bi_private, NULL); +} + +static void end_swap_bio_read(struct bio *bio) +{ + __end_swap_bio_read(bio); bio_put(bio); - if (waiter) { - blk_wake_io_task(waiter); - put_task_struct(waiter); - } } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -444,10 +441,11 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -static void swap_readpage_bdev(struct page *page, bool synchronous, +static void swap_readpage_bdev_sync(struct page *page, struct swap_info_struct *sis) { - struct bio *bio; + struct bio_vec bv; + struct bio bio; if ((sis->flags & SWP_SYNCHRONOUS_IO) && !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { @@ -455,30 +453,37 @@ static void swap_readpage_bdev(struct page *page, bool synchronous, return; } - bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_swap_bio_read; - bio_add_page(bio, page, thp_size(page), 0); + bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ - if (synchronous) { - get_task_struct(current); - bio->bi_private = current; - } + get_task_struct(current); count_vm_event(PSWPIN); - bio_get(bio); - submit_bio(bio); - while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio->bi_private)) - break; + submit_bio_wait(&bio); + __end_swap_bio_read(&bio); + put_task_struct(current); +} + +static void swap_readpage_bdev_async(struct page *page, + struct swap_info_struct *sis) +{ + struct bio *bio; - blk_io_schedule(); + if ((sis->flags & SWP_SYNCHRONOUS_IO) && + !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { + count_vm_event(PSWPIN); + return; } - __set_current_state(TASK_RUNNING); - bio_put(bio); + + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + count_vm_event(PSWPIN); + submit_bio(bio); } void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) @@ -508,8 +513,10 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); + } else if (synchronous) { + swap_readpage_bdev_sync(page, sis); } else { - swap_readpage_bdev(page, synchronous, sis); + swap_readpage_bdev_async(page, sis); } if (workingset) { -- cgit v1.2.3 From e3e2762bd3c5e02780618fc42f5b0049a3bedb30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:34 +0100 Subject: mm: remove the __swap_writepage return value __swap_writepage always returns 0. Link: https://lkml.kernel.org/r/20230125133436.447864-6-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 23 +++++++++-------------- mm/swap.h | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index d47def70e81f..3ba5a6e99030 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -177,11 +177,11 @@ bad_bmap: int swap_writepage(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); - int ret = 0; + int ret; if (folio_free_swap(folio)) { folio_unlock(folio); - goto out; + return 0; } /* * Arch code may have to preserve more data than just the page @@ -191,17 +191,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) if (ret) { folio_mark_dirty(folio); folio_unlock(folio); - goto out; + return ret; } if (frontswap_store(&folio->page) == 0) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); - goto out; + return 0; } - ret = __swap_writepage(&folio->page, wbc); -out: - return ret; + __swap_writepage(&folio->page, wbc); + return 0; } static inline void count_swpout_vm_event(struct page *page) @@ -288,7 +287,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) +static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) { struct swap_iocb *sio = NULL; struct swap_info_struct *sis = page_swap_info(page); @@ -325,11 +324,9 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) } if (wbc->swap_plug) *wbc->swap_plug = sio; - - return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc) +void __swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret; @@ -347,7 +344,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { count_swpout_vm_event(page); - return 0; + return; } bio = bio_alloc(sis->bdev, 1, @@ -362,8 +359,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) set_page_writeback(page); unlock_page(page); submit_bio(bio); - - return 0; } void swap_write_unplug(struct swap_iocb *sio) diff --git a/mm/swap.h b/mm/swap.h index c8fdda601751..7c033d793f15 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -17,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -int __swap_writepage(struct page *page, struct writeback_control *wbc); +void __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ -- cgit v1.2.3 From 05cda97ecb7046f4192a921741aae33b300dd628 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:35 +0100 Subject: mm: factor out a swap_writepage_bdev helper Split the block device case from swap_readpage into a separate helper, following the abstraction for file based swap. Link: https://lkml.kernel.org/r/20230125133436.447864-7-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 3ba5a6e99030..0a1a3b831344 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -326,23 +326,12 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) *wbc->swap_plug = sio; } -void __swap_writepage(struct page *page, struct writeback_control *wbc) +static void swap_writepage_bdev(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; - int ret; - struct swap_info_struct *sis = page_swap_info(page); - - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), - * but that will never affect SWP_FS_OPS, so the data_race - * is safe. - */ - if (data_race(sis->flags & SWP_FS_OPS)) - return swap_writepage_fs(page, wbc); - ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); - if (!ret) { + if (!bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc)) { count_swpout_vm_event(page); return; } @@ -361,6 +350,22 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc) submit_bio(bio); } +void __swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct swap_info_struct *sis = page_swap_info(page); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + /* + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + if (data_race(sis->flags & SWP_FS_OPS)) + swap_writepage_fs(page, wbc); + else + swap_writepage_bdev(page, wbc, sis); +} + void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; -- cgit v1.2.3 From 3222d8c2a7f888bf38b845b125e9470b12108a4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:36 +0100 Subject: block: remove ->rw_page The ->rw_page method is a special purpose bypass of the usual bio handling path that is limited to single-page reads and writes and synchronous which causes a lot of extra code in the drivers, callers and the block layer. The only remaining user is the MM swap code. Switch that swap code to simply submit a single-vec on-stack bio an synchronously wait on it based on a newly added QUEUE_FLAG_SYNCHRONOUS flag set by the drivers that currently implement ->rw_page instead. While this touches one extra cache line and executes extra code, it simplifies the block layer and drivers and ensures that all feastures are properly supported by all drivers, e.g. right now ->rw_page bypassed cgroup writeback entirely. [akpm@linux-foundation.org: fix comment typo, per Dan] Link: https://lkml.kernel.org/r/20230125133436.447864-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- block/bdev.c | 78 ------------------------------------------- drivers/block/brd.c | 15 +-------- drivers/block/zram/zram_drv.c | 61 +-------------------------------- drivers/nvdimm/btt.c | 16 +-------- drivers/nvdimm/pmem.c | 24 +------------ include/linux/blkdev.h | 12 ++++--- mm/page_io.c | 53 +++++++++++++++++------------ mm/swapfile.c | 2 +- 8 files changed, 44 insertions(+), 217 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index edc110d90df4..1795c7d4b99e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -304,84 +304,6 @@ out: } EXPORT_SYMBOL(thaw_bdev); -/** - * bdev_read_page() - Start reading a page from a block device - * @bdev: The device to read the page from - * @sector: The offset on the device to read the page to (need not be aligned) - * @page: The page to read - * - * On entry, the page should be locked. It will be unlocked when the page - * has been read. If the block driver implements rw_page synchronously, - * that will be true on exit from this function, but it need not be. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to read this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_read_page(struct block_device *bdev, sector_t sector, - struct page *page) -{ - const struct block_device_operations *ops = bdev->bd_disk->fops; - int result = -EOPNOTSUPP; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return result; - - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_READ); - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - -/** - * bdev_write_page() - Start writing a page to a block device - * @bdev: The device to write the page to - * @sector: The offset on the device to write the page to (need not be aligned) - * @page: The page to write - * @wbc: The writeback_control for the write - * - * On entry, the page should be locked and not currently under writeback. - * On exit, if the write started successfully, the page will be unlocked and - * under writeback. If the write failed already (eg the driver failed to - * queue the page to the device), the page will still be locked. If the - * caller is a ->writepage implementation, it will need to unlock the page. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to write this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_write_page(struct block_device *bdev, sector_t sector, - struct page *page, struct writeback_control *wbc) -{ - int result; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - - set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_WRITE); - if (result) { - end_page_writeback(page); - } else { - clean_page_buffers(page); - unlock_page(page); - } - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - /* * pseudo-fs */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 20acc4a1fd6d..37dce184eb56 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -309,23 +309,9 @@ static void brd_submit_bio(struct bio *bio) bio_endio(bio); } -static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct brd_device *brd = bdev->bd_disk->private_data; - int err; - - if (PageTransHuge(page)) - return -ENOTSUPP; - err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); - page_endio(page, op_is_write(op), err); - return err; -} - static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .submit_bio = brd_submit_bio, - .rw_page = brd_rw_page, }; /* @@ -411,6 +397,7 @@ static int brd_alloc(int i) /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); err = add_disk(disk); if (err) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5d1088a645e3..25526707f607 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1453,10 +1453,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, /* Slot should be unlocked before the function call */ zram_slot_unlock(zram, index); - /* A null bio means rw_page was used, we must fallback to bio */ - if (!bio) - return -EOPNOTSUPP; - ret = zram_bvec_read_from_bdev(zram, page, index, bio, partial_io); } @@ -2081,61 +2077,6 @@ static void zram_slot_free_notify(struct block_device *bdev, zram_slot_unlock(zram, index); } -static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - int offset, ret; - u32 index; - struct zram *zram; - struct bio_vec bv; - unsigned long start_time; - - if (PageTransHuge(page)) - return -ENOTSUPP; - zram = bdev->bd_disk->private_data; - - if (!valid_io_request(zram, sector, PAGE_SIZE)) { - atomic64_inc(&zram->stats.invalid_io); - ret = -EINVAL; - goto out; - } - - index = sector >> SECTORS_PER_PAGE_SHIFT; - offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - bv.bv_page = page; - bv.bv_len = PAGE_SIZE; - bv.bv_offset = 0; - - start_time = bdev_start_io_acct(bdev->bd_disk->part0, - SECTORS_PER_PAGE, op, jiffies); - ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); - bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); -out: - /* - * If I/O fails, just return error(ie, non-zero) without - * calling page_endio. - * It causes resubmit the I/O with bio request by upper functions - * of rw_page(e.g., swap_readpage, __swap_writepage) and - * bio->bi_end_io does things to handle the error - * (e.g., SetPageError, set_page_dirty and extra works). - */ - if (unlikely(ret < 0)) - return ret; - - switch (ret) { - case 0: - page_endio(page, op_is_write(op), 0); - break; - case 1: - ret = 0; - break; - default: - WARN_ON(1); - } - return ret; -} - static void zram_destroy_comps(struct zram *zram) { u32 prio; @@ -2290,7 +2231,6 @@ static const struct block_device_operations zram_devops = { .open = zram_open, .submit_bio = zram_submit_bio, .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, .owner = THIS_MODULE }; @@ -2389,6 +2329,7 @@ static int zram_add(void) set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0297b7882e33..d5593b0dc700 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1482,20 +1482,6 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct btt *btt = bdev->bd_disk->private_data; - int rc; - - rc = btt_do_bvec(btt, NULL, page, thp_size(page), 0, op, sector); - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return rc; -} - - static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) { /* some standard values */ @@ -1508,7 +1494,6 @@ static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) static const struct block_device_operations btt_fops = { .owner = THIS_MODULE, .submit_bio = btt_submit_bio, - .rw_page = btt_rw_page, .getgeo = btt_getgeo, }; @@ -1530,6 +1515,7 @@ static int btt_blk_init(struct btt *btt) blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); if (btt_meta_size(btt)) { rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 96e6e9a5f235..ceea55f621cc 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -238,28 +238,6 @@ static void pmem_submit_bio(struct bio *bio) bio_endio(bio); } -static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - blk_status_t rc; - - if (op_is_write(op)) - rc = pmem_do_write(pmem, page, 0, sector, thp_size(page)); - else - rc = pmem_do_read(pmem, page, 0, sector, thp_size(page)); - /* - * The ->rw_page interface is subtle and tricky. The core - * retries on any error, so we can only invoke page_endio() in - * the successful completion case. Otherwise, we'll see crashes - * caused by double completion. - */ - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return blk_status_to_errno(rc); -} - /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, @@ -310,7 +288,6 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .submit_bio = pmem_submit_bio, - .rw_page = pmem_rw_page, }; static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, @@ -565,6 +542,7 @@ static int pmem_attach_disk(struct device *dev, blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43d4e073b111..c5e59965b145 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -554,6 +554,7 @@ struct request_queue { #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ +#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ @@ -1250,6 +1251,12 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_synchronous(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_SYNCHRONOUS, + &bdev_get_queue(bdev)->queue_flags); +} + static inline bool bdev_stable_writes(struct block_device *bdev) { return test_bit(QUEUE_FLAG_STABLE_WRITES, @@ -1382,7 +1389,6 @@ struct block_device_operations { unsigned int flags); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, @@ -1417,10 +1423,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int bdev_read_page(struct block_device *, sector_t, struct page *); -extern int bdev_write_page(struct block_device *, sector_t, struct page *, - struct writeback_control *); - static inline void blk_wake_io_task(struct task_struct *waiter) { /* diff --git a/mm/page_io.c b/mm/page_io.c index 0a1a3b831344..a805117f7fd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -27,7 +27,7 @@ #include #include "swap.h" -static void end_swap_bio_write(struct bio *bio) +static void __end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -48,6 +48,11 @@ static void end_swap_bio_write(struct bio *bio) ClearPageReclaim(page); } end_page_writeback(page); +} + +static void end_swap_bio_write(struct bio *bio) +{ + __end_swap_bio_write(bio); bio_put(bio); } @@ -326,15 +331,31 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) *wbc->swap_plug = sio; } -static void swap_writepage_bdev(struct page *page, +static void swap_writepage_bdev_sync(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis) { - struct bio *bio; + struct bio_vec bv; + struct bio bio; - if (!bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc)) { - count_swpout_vm_event(page); - return; - } + bio_init(&bio, sis->bdev, &bv, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); + + bio_associate_blkg_from_page(&bio, page); + count_swpout_vm_event(page); + + set_page_writeback(page); + unlock_page(page); + + submit_bio_wait(&bio); + __end_swap_bio_write(&bio); +} + +static void swap_writepage_bdev_async(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) +{ + struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), @@ -362,8 +383,10 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc) */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(page, wbc); + else if (sis->flags & SWP_SYNCHRONOUS_IO) + swap_writepage_bdev_sync(page, wbc, sis); else - swap_writepage_bdev(page, wbc, sis); + swap_writepage_bdev_async(page, wbc, sis); } void swap_write_unplug(struct swap_iocb *sio) @@ -447,12 +470,6 @@ static void swap_readpage_bdev_sync(struct page *page, struct bio_vec bv; struct bio bio; - if ((sis->flags & SWP_SYNCHRONOUS_IO) && - !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { - count_vm_event(PSWPIN); - return; - } - bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_page_sector(page); bio_add_page(&bio, page, thp_size(page), 0); @@ -472,12 +489,6 @@ static void swap_readpage_bdev_async(struct page *page, { struct bio *bio; - if ((sis->flags & SWP_SYNCHRONOUS_IO) && - !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { - count_vm_event(PSWPIN); - return; - } - bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; @@ -513,7 +524,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); - } else if (synchronous) { + } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { swap_readpage_bdev_sync(page, sis); } else { swap_readpage_bdev_async(page, sis); diff --git a/mm/swapfile.c b/mm/swapfile.c index af151679d13a..888aed774fb6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3071,7 +3071,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; - if (p->bdev && p->bdev->bd_disk->fops->rw_page) + if (p->bdev && bdev_synchronous(p->bdev)) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { -- cgit v1.2.3 From 00cdf76012ab78b225345e8cf77d5391b4680b45 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:15:52 +0000 Subject: mm: add memcpy_from_file_folio() This is the equivalent of memcpy_from_page(). It differs in that it takes the position in a file instead of offset in a folio, it accepts the total number of bytes to be copied (instead of the number of bytes to be copied from this folio) and it returns how many bytes were copied from the folio, rather than making the caller calculate that and then checking if the caller got it right. [akpm@linux-foundation.org: fix typo in comment] Link: https://lkml.kernel.org/r/20230126201552.1681588-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: "Fabio M. De Francesco" Cc: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 29 +++++++++++++++++++++++++++++ include/linux/page-flags.h | 1 + 2 files changed, 30 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e22509420ac6..348701dae77f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -413,6 +413,35 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_file_folio - Copy some bytes from a file folio. + * @to: The destination buffer. + * @folio: The folio to copy from. + * @pos: The position in the file. + * @len: The maximum number of bytes to copy. + * + * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE + * if the folio comes from HIGHMEM, and by the size of the folio. + * + * Return: The number of bytes copied from the folio. + */ +static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, + loff_t pos, size_t len) +{ + size_t offset = offset_in_folio(folio, pos); + char *from = kmap_local_folio(folio, offset); + + if (folio_test_highmem(folio)) + len = min_t(size_t, len, PAGE_SIZE - offset); + else + len = min(len, folio_size(folio) - offset); + + memcpy(to, from, len); + kunmap_local(from); + + return len; +} + /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 69e93a0c1277..a7e3a3405520 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -531,6 +531,7 @@ PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) +#define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif -- cgit v1.2.3 From d585bdbeb79aa13b8a9bbe952d90f5252f7fe909 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:12:54 +0000 Subject: fs: convert writepage_t callback to pass a folio Patch series "Convert writepage_t to use a folio". More folioisation. I split out the mpage work from everything else because it completely dominated the patch, but some implementations I just converted outright. This patch (of 2): We always write back an entire folio, but that's currently passed as the head page. Convert all filesystems that use write_cache_pages() to expect a folio instead of a page. Link: https://lkml.kernel.org/r/20230126201255.1681189-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230126201255.1681189-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/cifs/file.c | 8 ++++---- fs/ext4/inode.c | 4 ++-- fs/ext4/super.c | 6 +++--- fs/fuse/file.c | 18 +++++++++--------- fs/iomap/buffered-io.c | 5 ++--- fs/mpage.c | 3 ++- fs/nfs/write.c | 7 ++++--- fs/ntfs3/inode.c | 6 +++--- fs/orangefs/inode.c | 23 +++++++++++------------ include/linux/writeback.h | 2 +- mm/page-writeback.c | 6 +++--- 11 files changed, 44 insertions(+), 44 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8cdd2f67af24..162fab5a4583 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2675,14 +2675,14 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, static int cifs_writepage_locked(struct page *page, struct writeback_control *wbc); -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) +static int cifs_write_one_page(struct folio *folio, + struct writeback_control *wbc, void *data) { struct address_space *mapping = data; int ret; - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); + ret = cifs_writepage_locked(&folio->page, wbc); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fb6cd994e59a..98c018dcd3fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2711,10 +2711,10 @@ out: return err; } -static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { - return ext4_writepage(page, wbc); + return ext4_writepage(&folio->page, wbc); } static int ext4_do_writepages(struct mpage_da_data *mpd) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 260c1b3e3ef2..49a8942b1e51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -482,7 +482,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) * * However, we may have to redirty a page (see below.) */ -static int ext4_journalled_writepage_callback(struct page *page, +static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { @@ -490,7 +490,7 @@ static int ext4_journalled_writepage_callback(struct page *page, struct buffer_head *bh, *head; struct journal_head *jh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: @@ -509,7 +509,7 @@ static int ext4_journalled_writepage_callback(struct page *page, if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 875314ee6f59..3648747fb64d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2184,7 +2184,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return false; } -static int fuse_writepages_fill(struct page *page, +static int fuse_writepages_fill(struct folio *folio, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; @@ -2203,7 +2203,7 @@ static int fuse_writepages_fill(struct page *page, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2238,7 +2238,7 @@ static int fuse_writepages_fill(struct page *page, data->max_pages = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; @@ -2246,13 +2246,13 @@ static int fuse_writepages_fill(struct page *page, ap->num_pages = 0; wpa->inode = inode; } - set_page_writeback(page); + folio_start_writeback(folio); - copy_highpage(tmp_page, page); + copy_highpage(tmp_page, &folio->page); ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = page; + data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); @@ -2266,13 +2266,13 @@ static int fuse_writepages_fill(struct page *page, spin_lock(&fi->lock); ap->num_pages++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, page)) { + } else if (fuse_writepage_add(wpa, &folio->page)) { data->wpa = wpa; } else { - end_page_writeback(page); + folio_end_writeback(folio); } out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 356193e44cf0..292d273a2c80 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1685,10 +1685,9 @@ done: * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ -static int -iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) { - struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; struct inode *inode = folio->mapping->host; u64 end_pos, isize; diff --git a/fs/mpage.c b/fs/mpage.c index 55988ea994ee..4890369bb10a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -440,9 +440,10 @@ void clean_page_buffers(struct page *page) clean_buffers(page, ~0U); } -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, +static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { + struct page *page = &folio->page; struct mpage_data *mpd = data; struct bio *bio = mpd->bio; struct address_space *mapping = page->mapping; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..9d6432cb3f44 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -689,13 +689,14 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(&folio->page, wbc, data); if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); return ret; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6b50b6e32378..9c646615f714 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -832,7 +832,7 @@ out: return err; } -static int ntfs_resident_writepage(struct page *page, +static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; @@ -840,11 +840,11 @@ static int ntfs_resident_writepage(struct page *page, int ret; ni_lock(ni); - ret = attr_data_write_resident(ni, page); + ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); if (ret != E_NTFS_NONRESIDENT) - unlock_page(page); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..c25468974c8a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -154,21 +154,20 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, return ret; } -static int orangefs_writepages_callback(struct page *page, - struct writeback_control *wbc, void *data) +static int orangefs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { struct orangefs_writepages *ow = data; - struct orangefs_write_range *wr; + struct orangefs_write_range *wr = folio->private; int ret; - if (!PagePrivate(page)) { - unlock_page(page); + if (!wr) { + folio_unlock(folio); /* It's not private so there's nothing to write, right? */ printk("writepages_callback not private!\n"); BUG(); return 0; } - wr = (struct orangefs_write_range *)page_private(page); ret = -1; if (ow->npages == 0) { @@ -176,7 +175,7 @@ static int orangefs_writepages_callback(struct page *page, ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -188,7 +187,7 @@ static int orangefs_writepages_callback(struct page *page, } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -198,10 +197,10 @@ done: orangefs_writepages_work(ow, wbc); ow->npages = 0; } - ret = orangefs_writepage_locked(page, wbc); - mapping_set_error(page->mapping, ret); - unlock_page(page); - end_page_writeback(page); + ret = orangefs_writepage_locked(&folio->page, wbc); + mapping_set_error(folio->mapping, ret); + folio_unlock(folio); + folio_end_writeback(folio); } else { if (ow->npages == ow->maxpages) { orangefs_writepages_work(ow, wbc); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f1491b07474..46020373e155 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -366,7 +366,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, +typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 92b90d2ab513..516b1aa247e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2470,7 +2470,7 @@ continue_unlock: goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = writepage(&folio->page, wbc, data); + error = writepage(folio, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of @@ -2528,11 +2528,11 @@ continue_unlock: } EXPORT_SYMBOL(write_cache_pages); -static int writepage_cb(struct page *page, struct writeback_control *wbc, +static int writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); + int ret = mapping->a_ops->writepage(&folio->page, wbc); mapping_set_error(mapping, ret); return ret; } -- cgit v1.2.3 From 9160cffd45ee93bc20de134e4f127dac9af0cc18 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:12:55 +0000 Subject: mpage: convert __mpage_writepage() to use a folio more fully This is just a conversion to the folio API. While there are some nods towards supporting multi-page folios in here, the blocks array is still sized for one page's worth of blocks, and there are other assumptions such as the blocks_per_page variable. [willy@infradead.org: fix accidentally-triggering WARN_ON_ONCE] Link: https://lkml.kernel.org/r/Y9kuaBgXf9lKJ8b0@casper.infradead.org Link: https://lkml.kernel.org/r/20230126201255.1681189-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/mpage.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index 4890369bb10a..b9b7f6dc9c37 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -443,13 +443,11 @@ void clean_page_buffers(struct page *page) static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; struct mpage_data *mpd = data; struct bio *bio = mpd->bio; - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - unsigned long end_index; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; @@ -460,13 +458,13 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; - int length; + size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + struct buffer_head *head = folio_buffers(folio); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ @@ -518,8 +516,8 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, /* * The page has no buffers: map it to disk */ - BUG_ON(!PageUptodate(page)); - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + BUG_ON(!folio_test_uptodate(folio)); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. @@ -527,7 +525,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; - map_bh.b_page = page; + map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; @@ -556,8 +554,11 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, first_unmapped = page_block; page_is_mapped: - end_index = i_size >> PAGE_SHIFT; - if (page->index >= end_index) { + /* Don't bother writing beyond EOF, truncate will discard the folio */ + if (folio_pos(folio) >= i_size) + goto confused; + length = folio_size(folio); + if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. @@ -566,11 +567,8 @@ page_is_mapped: * is zeroed when mapped, and writes to that region are not * written out to the file." */ - unsigned offset = i_size & (PAGE_SIZE - 1); - - if (page->index > end_index || !offset) - goto confused; - zero_user_segment(page, offset, PAGE_SIZE); + length = i_size - folio_pos(folio); + folio_zero_segment(folio, length, folio_size(folio)); } /* @@ -593,18 +591,18 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { + if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit(bio); goto alloc_new; } - clean_buffers(page, first_unmapped); + clean_buffers(&folio->page, first_unmapped); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(bio); if (boundary_block) { @@ -623,7 +621,7 @@ confused: /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(page, mpd->get_block, wbc); + ret = block_write_full_page(&folio->page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; -- cgit v1.2.3 From 6f74c0ec2095335158015ce29b708e775b9cea3a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 8 Feb 2023 15:08:01 +0100 Subject: arm/mm: fix swp type masking in __swp_entry() We're masking with the number of type bits instead of the type mask, which is obviously wrong. Link: https://lkml.kernel.org/r/39fd91e3-c93b-23c6-afc6-cbe473bb0ca9@redhat.com Fixes: 20aae9eff5ac ("arm/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE") Signed-off-by: David Hildenbrand Reported-by: Mark Brown Tested-by: Mark Brown Cc: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 2e626e6da9a3..a58ccbb406ad 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -292,7 +292,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_BITS) << __SWP_TYPE_SHIFT) | \ +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \ ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -- cgit v1.2.3 From c643e6ebedb435bcf863001f5e69a578f2658055 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 3 Feb 2023 16:28:40 -0500 Subject: mm: fix memcpy_from_file_folio() integer underflow If we have a HIGHMEM system with a large folio, 'offset' may be larger than PAGE_SIZE, and so min_t will cap at 'len' instead of the intended end-of-page. That can overflow into the next page which is likely to be unmapped and fault, but could theoretically copy the wrong data. Link: https://lkml.kernel.org/r/Y919vmSrtAgsf6K3@casper.infradead.org Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: "Fabio M. De Francesco" Cc: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 348701dae77f..b06254e76d99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -431,9 +431,10 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); - if (folio_test_highmem(folio)) + if (folio_test_highmem(folio)) { + offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); - else + } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); -- cgit v1.2.3 From e7f43ca99fc8bff2333547bb08dae20a35a23450 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:02 -0500 Subject: maple_tree: add mas_init() function Patch series "VMA tree type safety and remove __vma_adjust()", v4. This patchset does two things: 1. Clean up, including removal of __vma_adjust() and 2. Extends the VMA iterator API to provide type safety to the VMA operations using the maple tree, as requested by Linus [1]. It also addresses another issue of usability brought up by Linus about needing to modify the maple state within the loops. The maple state has been replaced by the VMA iterator and the iterator is now modified within the MM code so the caller should not need to worry about doing the work themselves when tree modifications occur. This brought up a potential inconsistency of the iterator state and what the user expects, so the inconsistency is addressed to keep the VMA iterator safe for use after the looping over a VMA range. This is addressed in patch 3 ("maple_tree: Reduce user error potential") and 4 ("test_maple_tree: Test modifications while iterating"). While cleaning up the state, the duplicate locking code in mm/mmap.c introduced by the maple tree has been address by abstracting it to two functions: vma_prepare() and vma_complete(). These abstractions allowed for a much simpler __vma_adjust(), which eventually leads to the removal of the __vma_adjust() function by placing the logic into the vma_merge() function itself. 1. https://lore.kernel.org/linux-mm/CAHk-=wg9WQXBGkNdKD2bqocnN73rDswuWsavBB7T-tekykEn_A@mail.gmail.com/ This patch (of 49): Add a function that will zero out the maple state struct and set some basic defaults. Link: https://lkml.kernel.org/r/20230120162650.984577-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a7bf58fd7cc6..1fadb5f5978b 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -432,6 +432,7 @@ struct ma_wr_state { .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ + .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ @@ -470,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, + unsigned long addr) +{ + memset(mas, 0, sizeof(struct ma_state)); + mas->tree = tree; + mas->index = mas->last = addr; + mas->max = ULONG_MAX; + mas->node = MAS_START; +} + /* Checks if a mas has not found anything */ static inline bool mas_is_none(struct ma_state *mas) { -- cgit v1.2.3 From 65be6f058b0eba98dc6c6f197ea9f62c9b6a519f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:03 -0500 Subject: maple_tree: fix potential rcu issue Ensure the node isn't dead after reading the node end. Link: https://lkml.kernel.org/r/20230120162650.984577-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1c5d3b640a24..7e3cf5b7e68b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4655,13 +4655,13 @@ static inline void *mas_next_nentry(struct ma_state *mas, pivots = ma_pivots(node, type); slots = ma_slots(node, type); mas->index = mas_safe_min(mas, pivots, mas->offset); + count = ma_data_end(node, type, pivots, mas->max); if (ma_dead_node(node)) return NULL; if (mas->index > max) return NULL; - count = ma_data_end(node, type, pivots, mas->max); if (mas->offset > count) return NULL; -- cgit v1.2.3 From 50e81c82ad947045c7ed26ddc9acb17276b653b6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:04 -0500 Subject: maple_tree: reduce user error potential When iterating, a user may operate on the tree and cause the maple state to be altered and left in an unintuitive state. Detect this scenario and correct it by setting to the limit and invalidating the state. Link: https://lkml.kernel.org/r/20230120162650.984577-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7e3cf5b7e68b..5804c5997598 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4732,6 +4732,11 @@ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) unsigned long last; enum maple_type mt; + if (mas->index > limit) { + mas->index = mas->last = limit; + mas_pause(mas); + return NULL; + } last = mas->last; retry: offset = mas->offset; @@ -4838,6 +4843,11 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) { void *entry; + if (mas->index < min) { + mas->index = mas->last = min; + mas_pause(mas); + return NULL; + } retry: while (likely(!mas_is_none(mas))) { entry = mas_prev_nentry(mas, min, mas->index); -- cgit v1.2.3 From 5159d64b335401fa83f18c27e2267f1eafc41bd3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:05 -0500 Subject: test_maple_tree: test modifications while iterating Add a testcase to ensure the iterator detects bad states on modifications and does what the user expects Link: https://lkml.kernel.org/r/20230120162650.984577-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ec847bf4dcb4..3d19b1f78d71 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1709,6 +1709,74 @@ static noinline void check_forking(struct maple_tree *mt) mtree_destroy(&newmt); } +static noinline void check_iteration(struct maple_tree *mt) +{ + int i, nr_entries = 125; + void *val; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i * 10, i * 10 + 9, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + + i = 0; + mas_lock(&mas); + mas_for_each(&mas, val, 925) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 92 */ + if (i == 92) { + mas.index = 925; + mas.last = 929; + mas_store(&mas, val); + } + i++; + } + /* Ensure mas_find() gets the next value */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 785) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite start of entry 78 */ + if (i == 78) { + mas.index = 780; + mas.last = 785; + mas_store(&mas, val); + } else { + i++; + } + } + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 765) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 76 and advance to the end */ + if (i == 76) { + mas.index = 760; + mas.last = 765; + mas_store(&mas, val); + mas_next(&mas, ULONG_MAX); + } + i++; + } + /* Make sure the next find returns the one after 765, 766-769 */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(76)); + mas_unlock(&mas); + mas_destroy(&mas); + mt_set_non_kernel(0); +} + static noinline void check_mas_store_gfp(struct maple_tree *mt) { @@ -2659,6 +2727,10 @@ static int maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_iteration(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_forking(&tree); mtree_destroy(&tree); -- cgit v1.2.3 From 1202700c3f8cc5f7e4646c3cf05ee6f7c8bc6ccf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:06 -0500 Subject: maple_tree: fix handle of invalidated state in mas_wr_store_setup() If an invalidated maple state is encountered during write, reset the maple state to MAS_START. This will result in a re-walk of the tree to the correct location for the write. Link: https://lore.kernel.org/all/20230107020126.1627-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20230120162650.984577-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: SeongJae Park Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5804c5997598..7c786bd5e575 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5609,6 +5609,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (unlikely(mas_is_paused(wr_mas->mas))) + mas_reset(wr_mas->mas); + if (!mas_is_start(wr_mas->mas)) { if (mas_is_none(wr_mas->mas)) { mas_reset(wr_mas->mas); -- cgit v1.2.3 From 17dc622c7b0f94e49bed030726df4db12ecaa6b5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:07 -0500 Subject: maple_tree: fix mas_prev() and mas_find() state handling When mas_prev() does not find anything, set the state to MAS_NONE. Handle the MAS_NONE in mas_find() like a MAS_START. Link: https://lkml.kernel.org/r/20230120162650.984577-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7c786bd5e575..5e9703189259 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4845,7 +4845,7 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) if (mas->index < min) { mas->index = mas->last = min; - mas_pause(mas); + mas->node = MAS_NONE; return NULL; } retry: @@ -5919,6 +5919,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) { /* Nothing comes before 0 */ mas->last = 0; + mas->node = MAS_NONE; return NULL; } @@ -6009,6 +6010,9 @@ void *mas_find(struct ma_state *mas, unsigned long max) mas->index = ++mas->last; } + if (unlikely(mas_is_none(mas))) + mas->node = MAS_START; + if (unlikely(mas_is_start(mas))) { /* First run or continue */ void *entry; -- cgit v1.2.3 From b62b633e048bbddef90b2e55d2e33823187b425f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:08 -0500 Subject: mm: expand vma iterator interface Add wrappers for the maple tree to the vma iterator. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++--- include/linux/mm_types.h | 4 +-- mm/internal.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 18 ++++++++++++++ 4 files changed, 125 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c9db257f09b3..b977a90d9829 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -670,16 +670,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return mas_find(&vmi->mas, max); + return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * Uses vma_find() to get the first VMA when the iterator starts. + * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ - return vma_find(vmi, ULONG_MAX); + return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -692,12 +692,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) return vmi->mas.index; } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 452920467223..5ca11c6c46e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -849,9 +849,7 @@ struct vma_iterator { static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { - vmi->mas.tree = &mm->mm_mt; - vmi->mas.index = addr; - vmi->mas.node = MAS_START; + mas_init(&vmi->mas, &mm->mm_mt, addr); } struct mmu_gather; diff --git a/mm/internal.h b/mm/internal.h index 2d1b9fa8083e..ffd65248f266 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -877,4 +877,68 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index ffc0815cd7fb..db70f3e2181e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -144,6 +144,24 @@ static void remove_vma(struct vm_area_struct *vma) vm_area_free(vma); } +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. -- cgit v1.2.3 From 92fed82047d7febc83614a9579c37f1ce80442be Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:09 -0500 Subject: mm/mmap: convert brk to use vma iterator Use the vma iterator API for the brk() system call. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index db70f3e2181e..94a477a55109 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,10 +180,10 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -194,7 +194,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool populate; bool downgraded = false; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -242,8 +242,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) int ret; /* Search one past newbrk */ - mas_set(&mas, newbrk); - brkvma = mas_find(&mas, oldbrk); + vma_iter_init(&vmi, mm, newbrk); + brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* @@ -252,7 +252,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; goto success; @@ -270,14 +270,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ - mas_set(&mas, oldbrk); - next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); + vma_iter_init(&vmi, mm, oldbrk); + next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; - brkvma = mas_prev(&mas, mm->start_brk); + brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; @@ -2917,8 +2917,8 @@ out: } /* - * brk_munmap() - Unmap a partial vma. - * @mas: The maple tree state. + * brk_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator * @vma: The vma to be modified * @newbrk: the start of the address to unmap * @oldbrk: The end of the address to unmap @@ -2928,7 +2928,7 @@ out: * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if * possible. */ -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf) { @@ -2936,14 +2936,14 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. - * @mas: The maple tree state. + * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, @@ -2953,7 +2953,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -2980,8 +2980,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); @@ -2991,7 +2990,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - mas_store_prealloc(mas, vma); + vma_iter_store(vmi, vma); if (vma->anon_vma) { anon_vma_interval_tree_post_update_vma(vma); @@ -3012,8 +3011,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_store_gfp(mas, vma, GFP_KERNEL)) + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; @@ -3042,7 +3040,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) int ret; bool populate; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3061,12 +3059,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; - vma = mas_prev(&mas, 0); - ret = do_brk_flags(&mas, vma, addr, len, flags); + vma = vma_prev(&vmi); + ret = do_brk_flags(&vmi, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); -- cgit v1.2.3 From 3b9dbd5e91b11911d21effbb80d1976fb21660df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:10 -0500 Subject: kernel/fork: convert forking to using the vmi iterator Avoid using the maple tree interface directly. This gains type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/fork.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..441dcec60aae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(old_vmi, oldmm, 0); + VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); + retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + for_each_vma(old_vmi, mpnt) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ - mas.index = tmp->vm_start; - mas.last = tmp->vm_end - 1; - mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; + if (vma_iter_bulk_store(&vmi, tmp)) + goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: - mas_destroy(&mas); + vma_iter_free(&vmi); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); -- cgit v1.2.3 From 79e4f2caa4401e56f8df34f658c43bacddc0ae03 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:11 -0500 Subject: mmap: convert vma_link() vma iterator Avoid using the maple tree interface directly. Link: https://lkml.kernel.org/r/20230120162650.984577-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 94a477a55109..17de99f31ff5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -487,10 +487,10 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; if (vma->vm_file) { @@ -498,7 +498,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_lock_write(mapping); } - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); if (mapping) { __vma_link_file(vma, mapping); -- cgit v1.2.3 From 0378c0a0e9e463b9e31b94fbbbc10f94b34225b6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:12 -0500 Subject: mm/mmap: remove preallocation from do_mas_align_munmap() In preparation of passing the vma state through split, the pre-allocation that occurs before the split has to be moved to after. Since the preallocation would then live right next to the store, just call store instead of preallocating. This effectively restores the potential error path of splitting and not munmap'ing which pre-dates the maple tree. Link: https://lkml.kernel.org/r/20230120162650.984577-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 17de99f31ff5..d46a798c38ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,9 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, GFP_KERNEL)) - return -ENOMEM; - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. @@ -2422,8 +2419,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* Point of no return */ - mas_set_range(mas, start, end - 1); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { @@ -2431,6 +2426,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; + mas_set_range(mas, start, end - 1); rcu_read_lock(); vma_test = mas_find(&test, end - 1); mas_for_each(mas, vma_mas, end - 1) { @@ -2440,10 +2436,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } rcu_read_unlock(); BUG_ON(count != test_count); - mas_set_range(mas, start, end - 1); } #endif - mas_store_prealloc(mas, NULL); + /* Point of no return */ + mas_set_range(mas, start, end - 1); + if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + return -ENOMEM; + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2475,7 +2474,6 @@ end_split_failed: __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: - mas_destroy(mas); return error; } -- cgit v1.2.3 From 183654ce26a5d5bd7bc11bcb02e8086f02f66d7d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:13 -0500 Subject: mmap: change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator Start passing the vma iterator through the mm code. This will allow for reuse of the state and cleaner invalidation if necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 77 +++++++++++++++++++++++++----------------------------- mm/mremap.c | 6 ++--- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b977a90d9829..152a1362b800 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, diff --git a/mm/mmap.c b/mm/mmap.c index d46a798c38ab..5b83023ba6a6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2305,8 +2305,8 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, } /* - * do_mas_align_munmap() - munmap the aligned region from @start to @end. - * @mas: The maple_state, ideally set up to alter the correct tree location. + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. @@ -2317,7 +2317,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, * If @downgrade is true, check return code for potential release of the lock. */ static int -do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { @@ -2329,7 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2349,27 +2348,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - /* - * mas_pause() is not needed since mas->index needs to be set - * differently than vma->vm_end anyways. - */ error = __split_vma(mm, vma, start, 0); if (error) goto start_split_failed; - mas_set(mas, start); - vma = mas_walk(mas); + vma_iter_set(vmi, start); + vma = vma_find(vmi, end); } - prev = mas_prev(mas, 0); + prev = vma_prev(vmi); if (unlikely((!prev))) - mas_set(mas, start); + vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - mas_for_each(mas, next, end - 1) { + for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { struct vm_area_struct *split; @@ -2378,8 +2373,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (error) goto end_split_failed; - mas_set(mas, end); - split = mas_prev(mas, 0); + vma_iter_set(vmi, end); + split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) goto munmap_sidetree_failed; @@ -2401,7 +2396,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } if (!next) - next = mas_next(mas, ULONG_MAX); + next = vma_next(vmi); if (unlikely(uf)) { /* @@ -2426,10 +2421,10 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; - mas_set_range(mas, start, end - 1); + vma_iter_set(vmi, start); rcu_read_lock(); vma_test = mas_find(&test, end - 1); - mas_for_each(mas, vma_mas, end - 1) { + for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, end - 1); @@ -2439,8 +2434,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } #endif /* Point of no return */ - mas_set_range(mas, start, end - 1); - if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + vma_iter_set(vmi, start); + if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) return -ENOMEM; mm->map_count -= count; @@ -2478,8 +2473,8 @@ map_count_exceeded: } /* - * do_mas_munmap() - munmap a given range. - * @mas: The maple state + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap @@ -2493,7 +2488,7 @@ map_count_exceeded: * * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. */ -int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade) { @@ -2511,11 +2506,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, arch_unmap(mm, start, end); /* Find the first overlapping VMA */ - vma = mas_find(mas, end - 1); + vma = vma_find(vmi, end); if (!vma) return 0; - return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. @@ -2527,9 +2522,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); - return do_mas_munmap(&mas, mm, start, len, uf, false); + return do_vmi_munmap(&vmi, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2545,7 +2540,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; pgoff_t vm_pgoff; int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); + VMA_ITERATOR(vmi, mm, addr); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2563,7 +2558,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2576,8 +2571,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); + next = vma_next(&vmi); + prev = vma_prev(&vmi); if (vm_flags & VM_SPECIAL) goto cannot_expand; @@ -2605,13 +2600,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } - mas.index = addr; - mas.last = end - 1; cannot_expand: /* * Determine the object being mapped and call the appropriate @@ -2650,7 +2643,7 @@ cannot_expand: error = -EINVAL; goto close_and_free_vma; } - mas_reset(&mas); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge @@ -2706,7 +2699,7 @@ cannot_expand: goto free_vma; } - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2719,7 +2712,7 @@ cannot_expand: if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2780,7 +2773,7 @@ unmap_and_free_vma: vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2797,12 +2790,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2934,7 +2927,7 @@ static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3057,7 +3050,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); + ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index 05f90f47e149..3cc64c3f8bdb 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -978,14 +978,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_mas_munmap does all the needed commit accounting, and + * do_vmi_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; - MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); + VMA_ITERATOR(vmi, mm, addr + new_len); - retval = do_mas_munmap(&mas, mm, addr + new_len, + retval = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ if (retval == 1) { -- cgit v1.2.3 From 3c441ab7d059ebfd2535a52c001c50eac5d63886 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:14 -0500 Subject: mmap: convert vma_expand() to use vma iterator Use the vma iterator instead of the maple state for type safety and for consistency through the mm code. Link: https://lkml.kernel.org/r/20230120162650.984577-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5b83023ba6a6..7e406416cf47 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -527,7 +527,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) * * Returns: 0 on success */ -inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { @@ -556,7 +556,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -581,8 +581,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - /* Note: mas must be pointing to the expanding VMA */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); if (file) { vma_interval_tree_insert(vma, root); @@ -2600,7 +2599,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } -- cgit v1.2.3 From f2ebfe43ba6c845e70b6acbabd6c69ab74b3c52e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:15 -0500 Subject: mm: add temporary vma iterator versions of vma_merge(), split_vma(), and __split_vma() These wrappers are short-lived in this patch set so that each user can be converted on its own. In the end, these functions are renamed in one commit. Link: https://lkml.kernel.org/r/20230120162650.984577-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++++- mm/mmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 152a1362b800..956025940053 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2843,11 +2843,20 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); + unsigned long addr, int new_below); +extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); +extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/mmap.c b/mm/mmap.c index 7e406416cf47..894017841d5d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1091,6 +1091,25 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return res; } +struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + struct vm_area_struct *tmp; + + tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, + policy, vm_userfaultfd_ctx, anon_name); + if (tmp) + vma_iter_set(vmi, end); + + return tmp; +} + /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. @@ -2276,6 +2295,18 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, validate_mm_mt(mm); return err; } +int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = __split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} /* * Split a vma into two pieces at address 'addr', a new vma is allocated @@ -2290,6 +2321,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} + static inline int munmap_sidetree(struct vm_area_struct *vma, struct ma_state *mas_detach) { -- cgit v1.2.3 From d60beb1f698a429825ea2c463ee9e3dc3b1a79b7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:16 -0500 Subject: ipc/shm: use the vma iterator for munmap calls Pass through the vma iterator to do_vmi_munmap() to handle the iterator state internally Link: https://lkml.kernel.org/r/20230120162650.984577-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- ipc/shm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index bd2fcc4d454e..1c6a6b319a49 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,10 +1810,9 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); - } + (vma->vm_file == file)) + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); vma = vma_next(&vmi); } -- cgit v1.2.3 From 27b267011296e35dd5c983bf6c53b7230c78f383 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 26 Jan 2023 16:20:49 -0500 Subject: ipc/shm: introduce new do_vma_munmap() to munmap The shm already has the vma iterator in position for a write. do_vmi_munmap() searches for the correct position and aligns the write, so it is not the right function to use in this case. The shm VMA tree modification is similar to the brk munmap situation, the vma iterator is in position and the VMA is already known. This patch generalizes the brk munmap function do_brk_munmap() to be used for any other callers with the vma iterator already in position to munmap a VMA. Link: https://lkml.kernel.org/r/20230126212049.980501-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Sven Schnelle Link: https://lore.kernel.org/linux-mm/yt9dh6wec21a.fsf@linux.ibm.com/ Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ ipc/shm.c | 11 ++++++----- mm/mmap.c | 38 ++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 956025940053..5b5f26d6588a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2922,6 +2922,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) diff --git a/ipc/shm.c b/ipc/shm.c index 1c6a6b319a49..60e45e7045d4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,9 +1810,10 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + (vma->vm_file == file)) { + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); + } vma = vma_next(&vmi); } diff --git a/mm/mmap.c b/mm/mmap.c index 894017841d5d..408e9cc47333 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,9 +180,6 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf); static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) @@ -236,7 +233,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * do_brk_munmap() may downgrade mmap_lock to read. + * do_vma_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; @@ -248,11 +245,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_brk_munmap() may downgrade the lock, so update it - * before calling do_brk_munmap(). + * do_vma_munmap() may downgrade the lock, so update it + * before calling do_vma_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); + ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true); if (ret == 1) { downgraded = true; goto success; @@ -2951,26 +2948,27 @@ out: } /* - * brk_munmap() - Unmap a full or partial vma. - * @vmi: The vma iterator - * @vma: The vma to be modified - * @newbrk: the start of the address to unmap - * @oldbrk: The end of the address to unmap + * do_vma_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator pointing at the vma + * @vma: The first vma to be munmapped + * @start: the start of the address to unmap + * @end: The end of the address to unmap * @uf: The userfaultfd list_head + * @downgrade: Attempt to downgrade or not * - * Returns: 1 on success. - * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if - * possible. + * Returns: 0 on success and not downgraded, 1 on success and downgraded. + * unmaps a VMA mapping when the vma iterator is already in position. + * Does not handle alignment. */ -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf) +int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade) { struct mm_struct *mm = vma->vm_mm; int ret; - arch_unmap(mm, newbrk, oldbrk); - ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); + arch_unmap(mm, start, end); + ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); validate_mm_mt(mm); return ret; } -- cgit v1.2.3 From 11a9b90274f6a50f7877a61c8e82dd3c845ff1dd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:17 -0500 Subject: userfaultfd: use vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 87 +++++++++++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 54 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15a5bf765d43..4334bd35984d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -883,7 +883,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); @@ -900,7 +900,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -909,13 +909,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { - mas_pause(&mas); vma = prev; } else { prev = vma; @@ -1302,7 +1301,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1344,17 +1343,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (!mmget_not_zero(mm)) goto out; + ret = -EINVAL; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); if (!vma) goto out_unlock; - /* check that there's at least one vma in the range */ - ret = -EINVAL; - if (vma->vm_start >= end) - goto out_unlock; - /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. @@ -1371,7 +1366,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1428,16 +1424,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, basic_ioctls = true; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); @@ -1458,30 +1452,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } next: /* @@ -1498,8 +1487,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1543,7 +1532,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1562,14 +1551,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if (!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1587,8 +1572,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. */ found = false; - ret = -EINVAL; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1605,16 +1590,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); - + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1650,26 +1632,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; - mas_pause(&mas); goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - mas_pause(&mas); } next: /* @@ -1683,8 +1662,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); -- cgit v1.2.3 From 2286a6914c776ec34cd97e4573b1466d055cb9de Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:18 -0500 Subject: mm: change mprotect_fixup to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 5 ++++- include/linux/mm.h | 6 +++--- mm/mprotect.c | 47 ++++++++++++++++++++++------------------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..b98647eeae9f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; + struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,10 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; + vma_iter_init(&vmi, mm, vma->vm_start); + tlb_gather_mmu(&tlb, mm); - ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, + ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b5f26d6588a..144ddfd65992 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2197,9 +2197,9 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. diff --git a/mm/mprotect.c b/mm/mprotect.c index 6a22f3ad9b84..39b6335b8813 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -585,9 +585,9 @@ static const struct mm_walk_ops prot_none_walk_ops = { }; int -mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags) +mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -642,7 +642,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, + *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, *pprev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); + error = vmi_split_vma(vmi, mm, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); + error = vmi_split_vma(vmi, mm, vma, end, 0); if (error) goto fail; } @@ -709,7 +709,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + struct vma_iterator vmi; start = untagged_addr(start); @@ -741,8 +741,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, current->mm, start); + vma = vma_find(&vmi, end); error = -ENOMEM; if (!vma) goto out; @@ -765,18 +765,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); - for (nstart = start ; ; ) { + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + if (vma->vm_start != tmp) { + error = -ENOMEM; + break; + } /* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC)) @@ -824,25 +828,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(current->mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } prot = reqprot; } tlb_finish_mmu(&tlb); + + if (vma_iter_end(&vmi) < end) + error = -ENOMEM; + out: mmap_write_unlock(current->mm); return error; -- cgit v1.2.3 From 37598f5a9d8b63b91cce0cb6bac5f6374ed1bb80 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:19 -0500 Subject: mlock: convert mlock to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mlock.c | 57 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b680f11879c3..0d09b9070071 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * * For vmas that pass the filters, merge/split as appropriate. */ -static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) +static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; @@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; } if (start != vma->vm_start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(vmi, mm, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(vmi, mm, vma, end, 0); if (ret) goto out; } @@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; - MA_STATE(mas, ¤t->mm->mm_mt, start, start); + VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); - for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { + vm_flags_t newflags; - newflags |= flags; + if (vma->vm_start != tmp) + return -ENOMEM; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(prev->vm_mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } } + + if (vma_iter_end(&vmi) < end) + return -ENOMEM; + return error; } @@ -658,7 +657,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= to_add; /* Ignore errors */ - mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - mas_pause(&mas); + mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); cond_resched(); } out: -- cgit v1.2.3 From e552cdb853dab085d30d54815e044aa4836a6dc6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:20 -0500 Subject: coredump: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/coredump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991b..f27d734f3102 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1111,14 +1111,14 @@ whole: * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, +static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { if (gate_vma && (vma == gate_vma)) return NULL; - vma = mas_next(mas, ULONG_MAX); + vma = vma_next(vmi); if (vma) return vma; return gate_vma; @@ -1146,7 +1146,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int i = 0; /* @@ -1167,7 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { + while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; -- cgit v1.2.3 From f10c2abcdac4a44795fae9118eaedfe56204afda Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:21 -0500 Subject: mempolicy: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mempolicy.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72142fbe7652..ed68bdf980d3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,24 +787,21 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); - - vma = mas_find(&mas, end - 1); + prev = vma_prev(&vmi); + vma = vma_find(&vmi, end); if (WARN_ON(!vma)) return 0; if (start > vma->vm_start) prev = vma; - for (; vma; vma = mas_next(&mas, end - 1)) { + do { unsigned long vmstart = max(start, vma->vm_start); unsigned long vmend = min(end, vma->vm_end); @@ -813,29 +810,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } replace: err = vma_replace_policy(vma, new_pol); @@ -843,7 +834,7 @@ replace: goto out; next: prev = vma; - } + } for_each_vma_range(vmi, vma, end); out: return err; -- cgit v1.2.3 From 250cb40f0afee232d8573d7b1d8bc56d4b92f63e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:22 -0500 Subject: task_mmu: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Update the comments to how the vma iterator works. The vma iterator will keep track of the last vm_end and start the search from vm_end + 1. Link: https://lkml.kernel.org/r/20230120162650.984577-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a44339a77a75..a944e1816364 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -890,7 +890,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -908,7 +908,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -923,7 +923,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -948,31 +948,31 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ + /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) continue; @@ -980,8 +980,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) if (vma->vm_end > last_vma_end) smap_gather_stats(vma, &mss, last_vma_end); } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1277,7 +1276,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1297,7 +1296,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; -- cgit v1.2.3 From 214dbc4281374cbbd833edd502d0ed1fd1b0e243 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:23 -0500 Subject: sched: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071..9c9950249d7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; - MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = mas_find(&mas, ULONG_MAX)) { + do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); - } + } for_each_vma(vmi, vma); out: /* -- cgit v1.2.3 From 178e22ac2078b1a7d284c7e3b4c3fbdb8e85ae99 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:24 -0500 Subject: madvise: use vmi iterator for __split_vma() and vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 92a3c6bd84c1..4d4471916465 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,6 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; + VMA_ITERATOR(vmi, mm, 0); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -149,8 +150,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), + *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; @@ -162,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, start, 1); + error = vmi__split_vma(&vmi, mm, vma, start, 1); if (error) return error; } @@ -170,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, end, 0); + error = vmi__split_vma(&vmi, mm, vma, end, 0); if (error) return error; } -- cgit v1.2.3 From 0c0c5bffd0a24637f1601ce15937ae38e572069c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:25 -0500 Subject: mmap: pass through vmi iterator to __split_vma() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 408e9cc47333..2b588b831ead 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2388,7 +2388,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = __split_vma(mm, vma, start, 0); + error = vmi__split_vma(vmi, mm, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2409,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = __split_vma(mm, next, end, 1); + error = vmi__split_vma(vmi, mm, next, end, 1); if (error) goto end_split_failed; -- cgit v1.2.3 From 076f16bf7698fae4b27030238998474a21d2233c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:26 -0500 Subject: mmap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2b588b831ead..8806bfbaa505 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2690,8 +2690,9 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, vma->vm_file, + vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3232,6 +3233,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); validate_mm_mt(mm); /* @@ -3247,7 +3249,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { -- cgit v1.2.3 From a27a11f92fe2bb1c02c8bba0a8315f9b7ad3b396 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:27 -0500 Subject: mm/mremap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mremap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 3cc64c3f8bdb..f161516ab3c1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1018,6 +1018,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long extension_end = addr + new_len; pgoff_t extension_pgoff = vma->vm_pgoff + ((extension_start - vma->vm_start) >> PAGE_SHIFT); + VMA_ITERATOR(vmi, mm, extension_start); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1042,10 +1043,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vma_merge(mm, vma, extension_start, extension_end, - vma->vm_flags, vma->anon_vma, vma->vm_file, - extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vmi_vma_merge(&vmi, mm, vma, + extension_start, extension_end, + vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; -- cgit v1.2.3 From 47d9644de92c1aa9dcd791203397b161c67096ca Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:28 -0500 Subject: nommu: convert nommu to using the vma iterator Gain type safety in nommu by using the vma_iterator and not the maple tree directly. Link: https://lkml.kernel.org/r/20230120162650.984577-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 79 ++++++++++++++++++++++++-------------------------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 0481922fe66e..7a52a7c37009 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -544,19 +544,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { vma->vm_mm = mm; @@ -574,13 +561,13 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } /* - * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). - * @mas: The maple state with preallocations. + * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). + * @vmi: The VMA iterator * @mm: The mm_struct * @vma: The vma to add * */ -static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, +static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *vma) { BUG_ON(!vma->vm_region); @@ -589,7 +576,7 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, mm->map_count++; /* add the VMA to the tree */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); } /* @@ -600,14 +587,14 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, */ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + VMA_ITERATOR(vmi, mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; } - mas_add_vma_to_mm(&mas, mm, vma); + vmi_add_vma_to_mm(&vmi, mm, vma); return 0; } @@ -626,14 +613,15 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } } + /* * delete a VMA from its owning mm_struct and address space */ static int delete_vma_from_mm(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -641,10 +629,9 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_mas_remove(vma, &mas); + vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); return 0; } - /* * destroy a VMA record */ @@ -675,9 +662,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - return mas_walk(&mas); + return vma_iter_load(&vmi); } EXPORT_SYMBOL(find_vma); @@ -709,9 +696,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return NULL; if (vma->vm_start != addr) @@ -1062,7 +1049,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); *populate = 0; @@ -1091,8 +1078,8 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, GFP_KERNEL)) - goto error_maple_preallocate; + if (vma_iter_prealloc(&vmi)) + goto error_vma_iter_prealloc; region->vm_usage = 1; region->vm_flags = vm_flags; @@ -1234,7 +1221,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - mas_add_vma_to_mm(&mas, current->mm, vma); + vmi_add_vma_to_mm(&vmi, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1250,7 +1237,7 @@ share: error_just_free: up_write(&nommu_region_sem); error: - mas_destroy(&mas); + vma_iter_free(&vmi); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1278,7 +1265,7 @@ error_getting_region: show_free_areas(0, NULL); return -ENOMEM; -error_maple_preallocate: +error_vma_iter_prealloc: kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); @@ -1344,20 +1331,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; - mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1369,10 +1354,10 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - goto err_mas_preallocate; + goto err_vmi_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1406,13 +1391,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); - mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); - mas_store(&mas, vma); - vma_mas_store(new, &mas); + vma_iter_store(vmi, new); mm->map_count++; return 0; -err_mas_preallocate: +err_vmi_preallocate: vm_area_free(new); err_vma_dup: kmem_cache_free(vm_region_jar, region); @@ -1466,7 +1449,7 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *vma; unsigned long end; int ret = 0; @@ -1478,7 +1461,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = mas_find(&mas, end - 1); + vma = vma_find(&vmi, end); if (!vma) { static int limit; if (limit < 5) { @@ -1497,7 +1480,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = mas_next(&mas, end - 1); + vma = vma_find(&vmi, end); } while (vma); return -EINVAL; } else { @@ -1511,7 +1494,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret < 0) return ret; } -- cgit v1.2.3 From 07f1bc5ad7983356ca79c65b22148dc5700a24e5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:29 -0500 Subject: nommu: pass through vma iterator to shrink_vma() Rename the function to vmi_shrink_vma() indicate it takes the vma iterator. Use the iterator to preallocate and drop the delete function. The maple tree is able to do the modification easier than the linked list and rbtree, so just clear the necessary area in the tree. add_vma_to_mm() is no longer used, so drop this function. vmi_add_vma_to_mm() is now only used once, so inline this function into do_mmap(). Link: https://lkml.kernel.org/r/20230120162650.984577-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 63 +++++++++++++++++--------------------------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 7a52a7c37009..9ddeb92600d6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,44 +560,6 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } } -/* - * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). - * @vmi: The VMA iterator - * @mm: The mm_struct - * @vma: The vma to add - * - */ -static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma) -{ - BUG_ON(!vma->vm_region); - - setup_vma_to_mm(vma, mm); - mm->map_count++; - - /* add the VMA to the tree */ - vma_iter_store(vmi, vma); -} - -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) -{ - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (vma_iter_prealloc(&vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - vmi_add_vma_to_mm(&vmi, mm, vma); - return 0; -} - static void cleanup_vma_from_mm(struct vm_area_struct *vma) { vma->vm_mm->map_count--; @@ -1221,7 +1183,11 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - vmi_add_vma_to_mm(&vmi, current->mm, vma); + BUG_ON(!vma->vm_region); + setup_vma_to_mm(vma, current->mm); + current->mm->map_count++; + /* add the VMA to the tree */ + vma_iter_store(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1406,7 +1372,7 @@ err_vma_dup: * shrink a VMA by removing the specified chunk from either the beginning or * the end */ -static int shrink_vma(struct mm_struct *mm, +static int vmi_shrink_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long from, unsigned long to) { @@ -1414,14 +1380,19 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (delete_vma_from_mm(vma)) + if (vma_iter_prealloc(vmi)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); return -ENOMEM; - if (from > vma->vm_start) + } + + if (from > vma->vm_start) { + vma_iter_clear(vmi, from, vma->vm_end); vma->vm_end = from; - else + } else { + vma_iter_clear(vmi, vma->vm_start, to); vma->vm_start = to; - if (add_vma_to_mm(mm, vma)) - return -ENOMEM; + } /* cut the backing region down to size */ region = vma->vm_region; @@ -1498,7 +1469,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (ret < 0) return ret; } - return shrink_vma(mm, vma, start, end); + return vmi_shrink_vma(&vmi, vma, start, end); } erase_whole_vma: -- cgit v1.2.3 From 9760ebffbf5507320e0de41f5b80089bdef996a0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:30 -0500 Subject: mm: switch vma_merge(), split_vma(), and __split_vma to vma iterator Drop the vmi_* functions and transition all users to use the vma iterator directly. Link: https://lkml.kernel.org/r/20230120162650.984577-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 14 +++++----- include/linux/mm.h | 18 ++++--------- mm/madvise.c | 6 ++--- mm/mempolicy.c | 6 ++--- mm/mlock.c | 6 ++--- mm/mmap.c | 79 +++++++++++++++--------------------------------------- mm/mprotect.c | 6 ++--- mm/mremap.c | 10 +++---- mm/nommu.c | 8 +++--- 9 files changed, 55 insertions(+), 98 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4334bd35984d..f3c75c6222de 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -909,7 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1452,7 +1452,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), @@ -1463,12 +1463,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } @@ -1632,7 +1632,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); @@ -1641,12 +1641,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 144ddfd65992..f3b49feb5c35 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2839,24 +2839,16 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, { return __vma_adjust(vma, start, end, pgoff, insert, NULL); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); -extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/madvise.c b/mm/madvise.c index 4d4471916465..02b317726c9a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -150,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { @@ -163,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, start, 1); + error = __split_vma(&vmi, vma, start, 1); if (error) return error; } @@ -171,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, end, 0); + error = __split_vma(&vmi, vma, end, 0); if (error) return error; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed68bdf980d3..dd5ca942256f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, + prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -819,12 +819,12 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto replace; } if (vma->vm_start != vmstart) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); + err = split_vma(&vmi, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); + err = split_vma(&vmi, vma, vmend, 0); if (err) goto out; } diff --git a/mm/mlock.c b/mm/mlock.c index 0d09b9070071..0336f52e03d7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -418,7 +418,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { @@ -427,13 +427,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - ret = vmi_split_vma(vmi, mm, vma, start, 1); + ret = split_vma(vmi, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = vmi_split_vma(vmi, mm, vma, end, 0); + ret = split_vma(vmi, vma, end, 0); if (ret) goto out; } diff --git a/mm/mmap.c b/mm/mmap.c index 8806bfbaa505..afc65f122f7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1010,7 +1010,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, @@ -1019,7 +1019,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *mid, *next, *res; + struct vm_area_struct *mid, *next, *res = NULL; int err = -1; bool merge_prev = false; bool merge_next = false; @@ -1085,26 +1085,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (err) return NULL; khugepaged_enter_vma(res, vm_flags); - return res; -} -struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, - struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct vm_area_struct *tmp; - - tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, - policy, vm_userfaultfd_ctx, anon_name); - if (tmp) + if (res) vma_iter_set(vmi, end); - return tmp; + return res; } /* @@ -2228,12 +2213,14 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ -int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - validate_mm_mt(mm); + unsigned long end = vma->vm_end; + + validate_mm_mt(vma->vm_mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2273,8 +2260,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ - if (!err) + if (!err) { + vma_iter_set(vmi, end); return 0; + } /* Avoid vm accounting in close() operation */ new->vm_start = new->vm_end; @@ -2289,46 +2278,21 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); - validate_mm_mt(mm); + validate_mm_mt(vma->vm_mm); return err; } -int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = __split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; -} /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); -} - -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; + return __split_vma(vmi, vma, addr, new_below); } static inline int munmap_sidetree(struct vm_area_struct *vma, @@ -2388,7 +2352,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = vmi__split_vma(vmi, mm, vma, start, 0); + error = __split_vma(vmi, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2373,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = vmi__split_vma(vmi, mm, next, end, 1); + error = __split_vma(vmi, next, end, 1); if (error) goto end_split_failed; @@ -2690,9 +2654,10 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, vma->vm_file, - vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, + vma->vm_file, vma->vm_pgoff, NULL, + NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3249,7 +3214,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 39b6335b8813..cce6a0e58fb5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -642,7 +642,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, + *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, *pprev = vma; if (start != vma->vm_start) { - error = vmi_split_vma(vmi, mm, vma, start, 1); + error = split_vma(vmi, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = vmi_split_vma(vmi, mm, vma, end, 0); + error = split_vma(vmi, vma, end, 0); if (error) goto fail; } diff --git a/mm/mremap.c b/mm/mremap.c index f161516ab3c1..71ba8eddd836 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1043,12 +1043,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vmi_vma_merge(&vmi, mm, vma, - extension_start, extension_end, - vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + vma = vma_merge(&vmi, mm, vma, extension_start, + extension_end, vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; diff --git a/mm/nommu.c b/mm/nommu.c index 9ddeb92600d6..9a166738909e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1297,18 +1297,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1465,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } -- cgit v1.2.3 From 34403fa579514a6de378f06f79239821c92305bf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:31 -0500 Subject: mm/damon/vaddr-test.h: stop using vma_mas_store() for maple tree store Prepare for the removal of the vma_mas_store() function by open coding the maple tree store in this test code. Set the range of the maple state and call the store function directly. Link: https://lkml.kernel.org/r/20230120162650.984577-31-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: kernel test robot Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index bce37c487540..c4b455b5ee30 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,19 +14,26 @@ #include -static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, +static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i; + int i, ret = -ENOMEM; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) - return; + return 0; mas_lock(&mas); - for (i = 0; i < nr_vmas; i++) - vma_mas_store(&vmas[i], &mas); + for (i = 0; i < nr_vmas; i++) { + mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); + if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) + goto failed; + } + + ret = 0; +failed: mas_unlock(&mas); + return ret; } /* @@ -71,7 +78,8 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) }; mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); - __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); + if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) + kunit_skip(test, "Failed to create VMA tree"); __damon_va_three_regions(&mm, regions); -- cgit v1.2.3 From fbcc3104b8437cc1babf04421e8bb8181561343e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:32 -0500 Subject: mmap: convert __vma_adjust() to use vma iterator Use the vma iterator internally for __vma_adjust(). Avoid using the maple tree interface directly for type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- mm/mmap.c | 75 ++++++++++-------------------------------------------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f3b49feb5c35..2f62d687e9bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2856,9 +2856,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); - static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/mm/mmap.c b/mm/mmap.c index afc65f122f7d..07ba54c34bd0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -432,56 +432,6 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } -/* - * vma_mas_store() - Store a VMA in the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to store a VMA in the maple tree when the @mas has already - * walked to the correct location. - * - * Note: the end address is inclusive in the maple tree. - */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_store(mas->tree, vma); - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -/* - * vma_mas_remove() - Remove a VMA from the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to remove a VMA from the maple tree when the @mas has already - * been established and points to the correct location. - * Note: the end address is inclusive in the maple tree. - */ -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - -/* - * vma_mas_szero() - Set a given range to zero. Used when modifying a - * vm_area_struct start or end. - * - * @mas: The maple tree ma_state - * @start: The start address to zero - * @end: The end address to zero. - */ -static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, - unsigned long end) -{ - trace_vma_mas_szero(mas->tree, start, end - 1); - mas_set_range(mas, start, end - 1); - mas_store_prealloc(mas, NULL); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); @@ -641,7 +591,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -726,7 +676,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -772,7 +722,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_mas_szero(&mas, vma->vm_start, start); + vma_iter_clear(&vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -782,8 +732,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_mas_szero(&mas, end, vma->vm_end); - mas_reset(&mas); + vma_iter_clear(&vmi, end, vma->vm_end); + vma_iter_set(&vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -794,13 +744,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_mas_store(next, &mas); + vma_iter_store(&vmi, next); } if (file) { @@ -820,8 +770,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - mas_reset(&mas); - vma_mas_store(insert, &mas); + vma_iter_store(&vmi, insert); mm->map_count++; } @@ -867,7 +816,7 @@ again: if (insert && file) uprobe_mmap(insert); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return 0; @@ -1999,7 +1948,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, vma->vm_start, address - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2081,7 +2031,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, address, vma->vm_end - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); -- cgit v1.2.3 From 9e56044625a1f472edc278105f41a60726991d89 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:33 -0500 Subject: mm: pass through vma iterator to __vma_adjust() Pass the vma iterator through to __vma_adjust() so the state can be updated. Link: https://lkml.kernel.org/r/20230120162650.984577-33-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/mmap.c | 31 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f62d687e9bd..9c15f401f295 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2831,13 +2831,15 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, +extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + VMA_ITERATOR(vmi, vma->vm_mm, start); + + return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 07ba54c34bd0..330de1ab6a8d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -576,9 +576,9 @@ nomem: * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) +int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next_next = NULL; /* uninit var warning */ @@ -591,7 +591,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -676,7 +675,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (vma_iter_prealloc(&vmi)) + if (vma_iter_prealloc(vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -722,7 +721,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_iter_clear(&vmi, vma->vm_start, start); + vma_iter_clear(vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -732,8 +731,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_iter_clear(&vmi, end, vma->vm_end); - vma_iter_set(&vmi, vma->vm_end); + vma_iter_clear(vmi, end, vma->vm_end); + vma_iter_set(vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -744,13 +743,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_iter_store(&vmi, vma); + vma_iter_store(vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(&vmi, next); + vma_iter_store(vmi, next); } if (file) { @@ -770,7 +769,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - vma_iter_store(&vmi, insert); + vma_iter_store(vmi, insert); mm->map_count++; } @@ -816,7 +815,7 @@ again: if (insert && file) uprobe_mmap(insert); - vma_iter_free(&vmi); + vma_iter_free(vmi); validate_mm(mm); return 0; @@ -1010,20 +1009,20 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); else /* cases 3, 8 */ - err = __vma_adjust(mid, addr, next->vm_end, + err = __vma_adjust(vmi, mid, addr, next->vm_end, next->vm_pgoff - pglen, NULL, next); res = next; } -- cgit v1.2.3 From 85ab779e3426dc47cc27418821e6577680b509b9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:34 -0500 Subject: madvise: use split_vma() instead of __split_vma() The split_vma() wrapper is specifically for this use case, so use it. [Liam.Howlett@oracle.com: fix VMA_ITERATOR start position] Link: https://lkml.kernel.org/r/20230125135809.85262-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-34-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 02b317726c9a..ca672e37b38c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,7 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - VMA_ITERATOR(vmi, mm, 0); + VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -161,17 +161,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, start, 1); + error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, end, 0); + error = split_vma(&vmi, vma, end, 0); if (error) return error; } -- cgit v1.2.3 From c465be97a4bc0022688d99f77a75b9be91843b31 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:35 -0500 Subject: mm: remove unnecessary write to vma iterator in __vma_adjust() If the vma start address is going to change due to an insert, then it is safe to not write the vma to the tree. The write of the insert vma will alter the tree as necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-35-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 330de1ab6a8d..bd3ccfb4d9e0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -719,10 +719,12 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - if ((vma->vm_start < start) && - (!insert || (insert->vm_end != start))) { - vma_iter_clear(vmi, vma->vm_start, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + if (vma->vm_start < start) { + if (!insert || (insert->vm_end != start)) { + vma_iter_clear(vmi, vma->vm_start, start); + vma_iter_set(vmi, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } } else { vma_changed = true; } -- cgit v1.2.3 From 0fd5a9e2b09ff589370f2c536df77654ed2d341f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:36 -0500 Subject: mm: pass vma iterator through to __vma_adjust() Pass the iterator through to be used in __vma_adjust(). The state of the iterator needs to be correct for the operation that will occur so make the adjustments. Link: https://lkml.kernel.org/r/20230120162650.984577-36-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index bd3ccfb4d9e0..1bdf66b3b96e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -525,6 +525,10 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_interval_tree_remove(vma, root); } + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -2164,13 +2168,13 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the end VMA. */ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - unsigned long end = vma->vm_end; validate_mm_mt(vma->vm_mm); @@ -2206,14 +2210,17 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); + err = __vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new, NULL); else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new, NULL); /* Success. */ if (!err) { - vma_iter_set(vmi, end); + if (new_below) + vma_next(vmi); return 0; } @@ -2308,8 +2315,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto start_split_failed; - vma_iter_set(vmi, start); - vma = vma_find(vmi, end); + vma = vma_iter_load(vmi); } prev = vma_prev(vmi); @@ -2329,7 +2335,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; - vma_iter_set(vmi, end); split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) @@ -2573,6 +2578,7 @@ cannot_expand: goto unacct_error; } + vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; vma->vm_flags = vm_flags; -- cgit v1.2.3 From b373037fa9bb374f26bbabc0779fe990d02d33b7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:37 -0500 Subject: mm: add vma iterator to vma_adjust() arguments Change the vma_adjust() function definition to accept the vma iterator and pass it through to __vma_adjust(). Update fs/exec to use the new vma_adjust() function parameters. Update mm/mremap to use the new vma_adjust() function parameters. Revert the __split_vma() calls back from __vma_adjust() to vma_adjust() and pass through the vma iterator. Link: https://lkml.kernel.org/r/20230120162650.984577-37-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 11 ++++------- include/linux/mm.h | 9 ++++----- mm/mmap.c | 10 +++++----- mm/mremap.c | 4 ++-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b98647eeae9f..76ee62e1d3f1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -731,12 +731,9 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) } tlb_finish_mmu(&tlb); - /* - * Shrink the vma to just the new range. Always succeeds. - */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); - - return 0; + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c15f401f295..2e95287a9f74 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2834,12 +2834,11 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +static inline int vma_adjust(struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + pgoff_t pgoff, struct vm_area_struct *insert) { - VMA_ITERATOR(vmi, vma->vm_mm, start); - - return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 1bdf66b3b96e..f61e45caa32c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2210,12 +2210,12 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = __vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), - new, NULL); + err = vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new); else - err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new, NULL); + err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new); /* Success. */ if (!err) { diff --git a/mm/mremap.c b/mm/mremap.c index 71ba8eddd836..2176f0cc7f9a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1047,8 +1047,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + } else if (vma_adjust(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { -- cgit v1.2.3 From cc8d1b097de78bd25b7b1ed32018b21cecbf3f6c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:38 -0500 Subject: mmap: clean up mmap_region() unrolling Move logic of unrolling to the error path as apposed to duplicating it within the function body. This reduces the potential of missing an update to one path when making changes. Link: https://lkml.kernel.org/r/20230120162650.984577-38-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Li Zetao Signed-off-by: Andrew Morton --- mm/mmap.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f61e45caa32c..95ea613a4378 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2601,12 +2601,11 @@ cannot_expand: * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. */ - if (WARN_ON((addr != vma->vm_start))) { - error = -EINVAL; + error = -EINVAL; + if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - } - vma_iter_set(&vmi, addr); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. @@ -2653,25 +2652,13 @@ cannot_expand: } /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -EINVAL; + if (!arch_validate_flags(vma->vm_flags)) + goto close_and_free_vma; - if (vma_iter_prealloc(&vmi)) { - error = -ENOMEM; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -ENOMEM; + if (vma_iter_prealloc(&vmi)) + goto close_and_free_vma; if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); @@ -2730,14 +2717,18 @@ expanded: return addr; close_and_free_vma: - if (vma->vm_ops && vma->vm_ops->close) + if (file && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); + + if (file || vma->vm_file) { unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; + fput(vma->vm_file); + vma->vm_file = NULL; - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, + vma->vm_end); + } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: -- cgit v1.2.3 From 6b73cff239e52fd43949c40eaabb369298c11aae Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:39 -0500 Subject: mm: change munmap splitting order and move_vma() Splitting can be more efficient when the order is not of concern. Change do_vmi_align_munmap() to reduce walking of the tree during split operations. move_vma() must also be altered to remove the dependency of keeping the original VMA as the active part of the split. Transition to using vma iterator to look up the prev and/or next vma after munmap. [Liam.Howlett@oracle.com: fix vma iterator initialization] Link: https://lkml.kernel.org/r/20230126212011.980350-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-39-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++---------------- mm/mremap.c | 28 +++++++++++++++++----------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 95ea613a4378..29ffd58d4091 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,21 +2329,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { - struct vm_area_struct *split; - - error = __split_vma(vmi, next, end, 1); + error = __split_vma(vmi, next, end, 0); if (error) goto end_split_failed; - - split = vma_prev(vmi); - error = munmap_sidetree(split, &mas_detach); - if (error) - goto munmap_sidetree_failed; - - count++; - if (vma == next) - vma = split; - break; } error = munmap_sidetree(next, &mas_detach); if (error) @@ -2356,9 +2344,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } - if (!next) - next = vma_next(vmi); - + next = vma_next(vmi); if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas diff --git a/mm/mremap.c b/mm/mremap.c index 2176f0cc7f9a..6c7f49ab7d19 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -580,11 +580,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long vm_flags = vma->vm_flags; unsigned long new_pgoff; unsigned long moved_len; - unsigned long excess = 0; + unsigned long account_start = 0; + unsigned long account_end = 0; unsigned long hiwater_vm; - int split = 0; int err = 0; bool need_rmap_locks; + struct vma_iterator vmi; /* * We'd prefer to avoid failure later on in do_munmap: @@ -662,10 +663,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; - excess = vma->vm_end - vma->vm_start - old_len; - if (old_addr > vma->vm_start && - old_addr + old_len < vma->vm_end) - split = 1; + if (vma->vm_start < old_addr) + account_start = vma->vm_start; + if (vma->vm_end > old_addr + old_len) + account_end = vma->vm_end; } /* @@ -700,11 +701,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } - if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + vma_iter_init(&vmi, mm, old_addr); + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); - excess = 0; + account_start = account_end = 0; } if (vm_flags & VM_LOCKED) { @@ -715,10 +717,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (excess) { + if (account_start) { + vma = vma_prev(&vmi); + vma->vm_flags |= VM_ACCOUNT; + } + + if (account_end) { + vma = vma_next(&vmi); vma->vm_flags |= VM_ACCOUNT; - if (split) - find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; -- cgit v1.2.3 From e3d73f848e5f2e9da46646c97fb127dfc6868767 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:40 -0500 Subject: mm/mmap: move anon_vma setting in __vma_adjust() Move the anon_vma setting & warn_no up the function. This is done to clear up the locking later. Link: https://lkml.kernel.org/r/20230120162650.984577-40-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 29ffd58d4091..12545ec9cdeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + + if (anon_vma) + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -703,12 +711,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } } - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; if (anon_vma) { - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) -- cgit v1.2.3 From 440703e082b9c79c3d4fffcca8c2dffd621e6dc5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:41 -0500 Subject: mm/mmap: refactor locking out of __vma_adjust() Move the locking into vma_prepare() and vma_complete() for use elsewhere Link: https://lkml.kernel.org/r/20230120162650.984577-41-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 14 ++++ mm/mmap.c | 231 ++++++++++++++++++++++++++++++++++------------------------ 2 files changed, 150 insertions(+), 95 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index ffd65248f266..90bb2078444c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -941,4 +941,18 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 12545ec9cdeb..1ef284b7e6fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,6 +573,127 @@ nomem: return -ENOMEM; } +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static inline void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static inline void vma_complete(struct vma_prepare *vp, + struct vma_iterator *vmi, struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->file, + vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove next_next too. + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -588,14 +709,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *next_next = NULL; /* uninit var warning */ struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; + struct vma_prepare vma_prep; if (next && !insert) { if (end >= next->vm_end) { @@ -691,39 +811,22 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, anon_vma != next->anon_vma); vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - - if (adjust_next) - uprobe_munmap(next, next->vm_start, next->vm_end); - - i_mmap_lock_write(mapping); - if (insert && insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(insert, insert->vm_file->f_mapping); - } - } - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_pre_update_vma(next); + memset(&vma_prep, 0, sizeof(vma_prep)); + vma_prep.vma = vma; + vma_prep.anon_vma = anon_vma; + vma_prep.file = file; + if (adjust_next) + vma_prep.adj_next = next; + if (file) + vma_prep.mapping = file->f_mapping; + vma_prep.insert = insert; + if (remove_next) { + vma_prep.remove = next; + vma_prep.remove2 = next_next; } - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - if (adjust_next) - vma_interval_tree_remove(next, root); - } + vma_prepare(&vma_prep); if (start != vma->vm_start) { if (vma->vm_start < start) { @@ -761,69 +864,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, next); } - if (file) { - if (adjust_next) - vma_interval_tree_insert(next, root); - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - vma_iter_store(vmi, insert); - mm->map_count++; - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - - if (adjust_next) - uprobe_mmap(next); - } - - if (remove_next) { -again: - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - if (remove_next != 2) - BUG_ON(vma->vm_end < next->vm_end); - vm_area_free(next); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. - */ - if (remove_next == 2) { - remove_next = 1; - next = next_next; - goto again; - } - } - if (insert && file) - uprobe_mmap(insert); - + vma_complete(&vma_prep, vmi, mm); vma_iter_free(vmi); validate_mm(mm); -- cgit v1.2.3 From 9303d3e1c3f8d8863d561a070489cf44ce5e4103 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:42 -0500 Subject: mm/mmap: use vma_prepare() and vma_complete() in vma_expand() Use the new locking functions for vma_expand(). This reduces code duplication. At the same time change VM_BUG_ON() to VM_WARN_ON() Link: https://lkml.kernel.org/r/20230120162650.984577-42-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 188 ++++++++++++++++++++++++-------------------------------------- 1 file changed, 72 insertions(+), 116 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1ef284b7e6fb..b5bf70db3777 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,122 +457,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } -/* - * vma_expand - Expand an existing VMA - * - * @mas: The maple state - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. - * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. - * - * Returns: 0 on success - */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) -{ - struct mm_struct *mm = vma->vm_mm; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; - struct anon_vma *anon_vma = vma->anon_vma; - struct file *file = vma->vm_file; - bool remove_next = false; - - if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; - - anon_vma = next->anon_vma; - vma->anon_vma = anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } - } - - /* Not merging but overwriting any part of next is not handled. */ - VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); - /* Only handles expanding */ - VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - - if (vma_iter_prealloc(vmi)) - goto nomem; - - vma_adjust_trans_huge(vma, start, end, 0); - - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - i_mmap_lock_write(mapping); - } - - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - } - - /* VMA iterator points to previous, so set to start if necessary */ - if (vma_iter_addr(vmi) != start) - vma_iter_set(vmi, start); - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - vma_iter_store(vmi, vma); - - if (file) { - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - /* Expanding over the next vma */ - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - } - - if (remove_next) { - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - vm_area_free(next); - } - - validate_mm(mm); - return 0; - -nomem: - return -ENOMEM; -} - /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -694,6 +578,78 @@ again: uprobe_mmap(vp->insert); } +/* + * vma_expand - Expand an existing VMA + * + * @vmi: The vma iterator + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct vma_prepare vp; + + memset(&vp, 0, sizeof(vp)); + vp.vma = vma; + vp.anon_vma = vma->anon_vma; + if (next && (vma != next) && (end == next->vm_end)) { + vp.remove = next; + if (next->anon_vma && !vma->anon_vma) { + int error; + + vp.anon_vma = next->anon_vma; + vma->anon_vma = next->anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_WARN_ON(next && !vp.remove && + next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + + if (vma_iter_prealloc(vmi)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + vp.file = vma->vm_file; + if (vp.file) + vp.mapping = vp.file->f_mapping; + + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + + vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_iter_store(vmi, vma); + + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; + +nomem: + return -ENOMEM; +} /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. -- cgit v1.2.3 From 68cefec539206ea691409f0d1e944ed343f23f04 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:43 -0500 Subject: mm/mmap: introduce init_vma_prep() and init_multi_vma_prep() Add init_vma_prep() and init_multi_vma_prep() to set up the struct vma_prepare. This is to abstract the locking when adjusting the VMAs. Also change __vma_adjust() variable remove_next int in favour of a pointer to the VMA to remove. Rename next_next to remove2 since this better reflects its use. Link: https://lkml.kernel.org/r/20230120162650.984577-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 108 +++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b5bf70db3777..e259b33fcb52 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,6 +457,45 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } +/* + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed + */ +static inline void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, struct vm_area_struct *next, + struct vm_area_struct *remove, struct vm_area_struct *remove2) +{ + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static inline void init_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + + /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -566,7 +605,7 @@ again: /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. + * we must remove the one after next as well. */ if (vp->remove2) { vp->remove = vp->remove2; @@ -599,17 +638,14 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { + bool remove_next = false; struct vma_prepare vp; - memset(&vp, 0, sizeof(vp)); - vp.vma = vma; - vp.anon_vma = vma->anon_vma; if (next && (vma != next) && (end == next->vm_end)) { - vp.remove = next; + remove_next = true; if (next->anon_vma && !vma->anon_vma) { int error; - vp.anon_vma = next->anon_vma; vma->anon_vma = next->anon_vma; error = anon_vma_clone(vma, next); if (error) @@ -617,6 +653,7 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, } } + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON(next && !vp.remove && next != vma && end > next->vm_start); @@ -627,11 +664,6 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, goto nomem; vma_adjust_trans_huge(vma, start, end, 0); - - vp.file = vma->vm_file; - if (vp.file) - vp.mapping = vp.file->f_mapping; - /* VMA iterator points to previous, so set to start if necessary */ if (vma_iter_addr(vmi) != start) vma_iter_set(vmi, start); @@ -662,14 +694,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *remove2 = NULL; + struct vm_area_struct *remove = NULL; struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; @@ -688,25 +719,24 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, */ VM_WARN_ON(end != next->vm_end); /* - * remove_next == 3 means we're - * removing "vma" and that to do so we + * we're removing "vma" and that to do so we * swapped "vma" and "next". */ - remove_next = 3; VM_WARN_ON(file != next->vm_file); swap(vma, next); + remove = next; } else { VM_WARN_ON(expand != vma); /* - * case 1, 6, 7, remove_next == 2 is case 6, - * remove_next == 1 is case 1 or 7. + * case 1, 6, 7, remove next. + * case 6 also removes the one beyond next */ - remove_next = 1 + (end > next->vm_end); - if (remove_next == 2) - next_next = find_vma(mm, next->vm_end); + remove = next; + if (end > next->vm_end) + remove2 = find_vma(mm, next->vm_end); - VM_WARN_ON(remove_next == 2 && - end != next_next->vm_end); + VM_WARN_ON(remove2 != NULL && + end != remove2->vm_end); } exporter = next; @@ -716,8 +746,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && !next->anon_vma) - exporter = next_next; + if (remove2 != NULL && !next->anon_vma) + exporter = remove2; } else if (end > next->vm_start) { /* @@ -758,30 +788,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; - - if (anon_vma) - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - memset(&vma_prep, 0, sizeof(vma_prep)); - vma_prep.vma = vma; - vma_prep.anon_vma = anon_vma; - vma_prep.file = file; - if (adjust_next) - vma_prep.adj_next = next; - if (file) - vma_prep.mapping = file->f_mapping; - vma_prep.insert = insert; - if (remove_next) { - vma_prep.remove = next; - vma_prep.remove2 = next_next; - } + init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, + remove2); + VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && + vma_prep.anon_vma != next->anon_vma); + vma_prep.insert = insert; vma_prepare(&vma_prep); if (start != vma->vm_start) { -- cgit v1.2.3 From b2b3b886738fec5e89ca9ebc720eba1a8f615753 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:44 -0500 Subject: mm: don't use __vma_adjust() in __split_vma() Use the abstracted locking and maple tree operations. Since __split_vma() is the only user of the __vma_adjust() function to use the insert argument, drop that argument. Remove the NULL passed through from fs/exec's shift_arg_pages() and mremap() at the same time. Link: https://lkml.kernel.org/r/20230120162650.984577-44-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 +- include/linux/mm.h | 7 ++-- mm/mmap.c | 118 +++++++++++++++++++++++++---------------------------- mm/mremap.c | 2 +- 4 files changed, 61 insertions(+), 70 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 76ee62e1d3f1..d52fca2dd30b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e95287a9f74..3845de5d2581 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2832,13 +2832,12 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); static inline int vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff, struct vm_area_struct *insert) + pgoff_t pgoff) { - return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index e259b33fcb52..3d08d7d243f0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -691,7 +691,7 @@ nomem: */ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *insert, struct vm_area_struct *expand) + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *remove2 = NULL; @@ -704,7 +704,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; - if (next && !insert) { + if (next) { if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -795,39 +795,25 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && vma_prep.anon_vma != next->anon_vma); - vma_prep.insert = insert; vma_prepare(&vma_prep); - if (start != vma->vm_start) { - if (vma->vm_start < start) { - if (!insert || (insert->vm_end != start)) { - vma_iter_clear(vmi, vma->vm_start, start); - vma_iter_set(vmi, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); - } - } else { - vma_changed = true; - } - vma->vm_start = start; - } - if (end != vma->vm_end) { - if (vma->vm_end > end) { - if (!insert || (insert->vm_start != end)) { - vma_iter_clear(vmi, end, vma->vm_end); - vma_iter_set(vmi, vma->vm_end); - VM_WARN_ON(insert && - insert->vm_end < vma->vm_end); - } - } else { - vma_changed = true; - } - vma->vm_end = end; - } + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + else if (start != vma->vm_start) + vma_changed = true; + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + else if (end != vma->vm_end) + vma_changed = true; + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; if (vma_changed) vma_iter_store(vmi, vma); - vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; @@ -846,9 +832,9 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1030,20 +1016,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); + next->vm_end, prev->vm_pgoff, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); + end, prev->vm_pgoff, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); + addr, prev->vm_pgoff, next); else /* cases 3, 8 */ err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); + next->vm_pgoff - pglen, next); res = next; } @@ -2187,11 +2172,15 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { + struct vma_prepare vp; struct vm_area_struct *new; int err; validate_mm_mt(vma->vm_mm); + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); + if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) @@ -2202,16 +2191,20 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) return -ENOMEM; - if (new_below) + err = -ENOMEM; + if (vma_iter_prealloc(vmi)) + goto out_free_vma; + + if (new_below) { new->vm_end = addr; - else { + } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = vma_dup_policy(vma, new); if (err) - goto out_free_vma; + goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) @@ -2223,33 +2216,32 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) - err = vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), - new); - else - err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); - /* Success. */ - if (!err) { - if (new_below) - vma_next(vmi); - return 0; + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; } - /* Avoid vm accounting in close() operation */ - new->vm_start = new->vm_end; - new->vm_pgoff = 0; - /* Clean everything up if vma_adjust failed. */ - if (new->vm_ops && new->vm_ops->close) - new->vm_ops->close(new); - if (new->vm_file) - fput(new->vm_file); - unlink_anon_vmas(new); - out_free_mpol: + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); + + /* Success. */ + if (new_below) + vma_next(vmi); + validate_mm_mt(vma->vm_mm); + return 0; + +out_free_mpol: mpol_put(vma_policy(new)); - out_free_vma: +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: vm_area_free(new); validate_mm_mt(vma->vm_mm); return err; diff --git a/mm/mremap.c b/mm/mremap.c index 6c7f49ab7d19..b98185f48ba5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1054,7 +1054,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL)) { + addr + new_len, vma->vm_pgoff)) { vma = NULL; } if (!vma) { -- cgit v1.2.3 From 7c9813e886bb52495ff5b97d4b0f1320d36d869b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:45 -0500 Subject: mm/mremap: convert vma_adjust() to vma_expand() Stop using vma_adjust() in preparation for removing the function. Export vma_expand() to use instead. Link: https://lkml.kernel.org/r/20230120162650.984577-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ mm/mmap.c | 6 +++--- mm/mremap.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3845de5d2581..245fb30858c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2839,6 +2839,9 @@ static inline int vma_adjust(struct vma_iterator *vmi, { return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 3d08d7d243f0..9599db011b18 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -634,9 +634,9 @@ again: * * Returns: 0 on success */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { bool remove_next = false; struct vma_prepare vp; diff --git a/mm/mremap.c b/mm/mremap.c index b98185f48ba5..5c9a57909862 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1053,8 +1053,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff)) { + } else if (vma_expand(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { -- cgit v1.2.3 From cf51e86dfbe39b7cae3a9de650d035af22dd5fb4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:46 -0500 Subject: mm/mmap: don't use __vma_adjust() in shift_arg_pages() Introduce shrink_vma() which uses the vma_prepare() and vma_complete() functions to reduce the vma coverage. Convert shift_arg_pages() to use expand_vma() and the new shrink_vma() function. Remove support from __vma_adjust() to reduce a vma size since shift_arg_pages() is the only user that shrinks a VMA in this way. Link: https://lkml.kernel.org/r/20230120162650.984577-46-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 ++-- include/linux/mm.h | 10 ++-------- mm/mmap.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d52fca2dd30b..c0df813d2b45 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) + if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 245fb30858c9..dcc34533d2f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2831,17 +2831,11 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); -static inline int vma_adjust(struct vma_iterator *vmi, - struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff) -{ - return __vma_adjust(vmi, vma, start, end, pgoff, NULL); -} extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 9599db011b18..07b52acfd565 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,44 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, nomem: return -ENOMEM; } + +/* + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise + */ +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) +{ + struct vma_prepare vp; + + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); + + if (vma_iter_prealloc(vmi)) + return -ENOMEM; + + init_vma_prep(&vp, vma); + vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -797,14 +835,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vma_prep); - if (vma->vm_start < start) - vma_iter_clear(vmi, vma->vm_start, start); - else if (start != vma->vm_start) - vma_changed = true; - - if (vma->vm_end > end) - vma_iter_clear(vmi, end, vma->vm_end); - else if (end != vma->vm_end) + if (start < vma->vm_start || end > vma->vm_end) vma_changed = true; vma->vm_start = start; @@ -817,7 +848,10 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(vmi, next); + if (adjust_next < 0) { + WARN_ON_ONCE(vma_changed); + vma_iter_store(vmi, next); + } } vma_complete(&vma_prep, vmi, mm); -- cgit v1.2.3 From 04241ffe3f0458d54c61cf6c9d58d703efda4dd5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:47 -0500 Subject: mm/mmap: introduce dup_vma_anon() helper Create a helper for duplicating the anon vma when adjusting the vma. This simplifies the logic of __vma_adjust(). Link: https://lkml.kernel.org/r/20230120162650.984577-47-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 74 ++++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 07b52acfd565..265c4605dad2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -617,6 +617,29 @@ again: uprobe_mmap(vp->insert); } +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * + * Returns: 0 on success. + */ +static inline int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + dst->anon_vma = src->anon_vma; + return anon_vma_clone(dst, src); + } + + return 0; +} + /* * vma_expand - Expand an existing VMA * @@ -642,15 +665,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vma_prepare vp; if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; + int ret; - vma->anon_vma = next->anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } + remove_next = true; + ret = dup_anon_vma(vma, next); + if (ret) + return ret; } init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); @@ -739,10 +759,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; if (next) { + int error = 0; + if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -777,15 +798,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, end != remove2->vm_end); } - exporter = next; - importer = vma; - /* * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove2 != NULL && !next->anon_vma) - exporter = remove2; + if (remove != NULL && !next->anon_vma) + error = dup_anon_vma(vma, remove2); + else + error = dup_anon_vma(vma, remove); } else if (end > next->vm_start) { /* @@ -793,9 +813,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 5 shifting the boundary up. */ adjust_next = (end - next->vm_start); - exporter = next; - importer = vma; - VM_WARN_ON(expand != importer); + VM_WARN_ON(expand != vma); + error = dup_anon_vma(vma, next); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -803,24 +822,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 4 shifting the boundary down. */ adjust_next = -(vma->vm_end - end); - exporter = vma; - importer = next; - VM_WARN_ON(expand != importer); - } - - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (exporter && exporter->anon_vma && !importer->anon_vma) { - int error; - - importer->anon_vma = exporter->anon_vma; - error = anon_vma_clone(importer, exporter); - if (error) - return error; + VM_WARN_ON(expand != next); + error = dup_anon_vma(next, vma); } + if (error) + return error; } if (vma_iter_prealloc(vmi)) -- cgit v1.2.3 From 287051b185048c4aabe666439e64232ac59135a6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:48 -0500 Subject: mm/mmap: convert do_brk_flags() to use vma_prepare() and vma_complete() Use the abstracted vma locking for do_brk_flags() Link: https://lkml.kernel.org/r/20230120162650.984577-48-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 265c4605dad2..604ba8293a95 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2936,6 +2936,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vma_prepare vp; validate_mm_mt(mm); /* @@ -2963,18 +2964,13 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - if (vma->anon_vma) { - anon_vma_lock_write(vma->anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } + init_vma_prep(&vp, vma); + vma_prepare(&vp); vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; vma_iter_store(vmi, vma); - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } + vma_complete(&vp, vmi, mm); khugepaged_enter_vma(vma, flags); goto out; } -- cgit v1.2.3 From 0503ea8f5ba73eb3ab13a81c1eefbaf51405385a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:49 -0500 Subject: mm/mmap: remove __vma_adjust() Inline the work of __vma_adjust() into vma_merge(). This reduces code size and has the added benefits of the comments for the cases being located with the code. Change the comments referencing vma_adjust() accordingly. [Liam.Howlett@oracle.com: fix vma_merge() offset when expanding the next vma] Link: https://lkml.kernel.org/r/20230130195713.2881766-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-49-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 250 +++++++++++++++++++----------------------------- mm/rmap.c | 15 +-- 4 files changed, 107 insertions(+), 162 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1a3904e0179c..59887c69d54c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/mm/filemap.c b/mm/filemap.c index c915ded191f0..992554c18f1f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -97,7 +97,7 @@ * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem - * ->anon_vma.lock (vma_adjust) + * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) diff --git a/mm/mmap.c b/mm/mmap.c index 604ba8293a95..8ce4cee42dce 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -740,133 +740,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } -/* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. - */ -int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *expand) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *remove2 = NULL; - struct vm_area_struct *remove = NULL; - struct vm_area_struct *next = find_vma(mm, vma->vm_end); - struct vm_area_struct *orig_vma = vma; - struct file *file = vma->vm_file; - bool vma_changed = false; - long adjust_next = 0; - struct vma_prepare vma_prep; - - if (next) { - int error = 0; - - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - * The only other cases that gets here are - * case 1, case 7 and case 8. - */ - if (next == expand) { - /* - * The only case where we don't expand "vma" - * and we expand "next" instead is case 8. - */ - VM_WARN_ON(end != next->vm_end); - /* - * we're removing "vma" and that to do so we - * swapped "vma" and "next". - */ - VM_WARN_ON(file != next->vm_file); - swap(vma, next); - remove = next; - } else { - VM_WARN_ON(expand != vma); - /* - * case 1, 6, 7, remove next. - * case 6 also removes the one beyond next - */ - remove = next; - if (end > next->vm_end) - remove2 = find_vma(mm, next->vm_end); - - VM_WARN_ON(remove2 != NULL && - end != remove2->vm_end); - } - - /* - * If next doesn't have anon_vma, import from vma after - * next, if the vma overlaps with it. - */ - if (remove != NULL && !next->anon_vma) - error = dup_anon_vma(vma, remove2); - else - error = dup_anon_vma(vma, remove); - - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. - */ - adjust_next = (end - next->vm_start); - VM_WARN_ON(expand != vma); - error = dup_anon_vma(vma, next); - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = -(vma->vm_end - end); - VM_WARN_ON(expand != next); - error = dup_anon_vma(next, vma); - } - if (error) - return error; - } - - if (vma_iter_prealloc(vmi)) - return -ENOMEM; - - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - - init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, - remove2); - VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && - vma_prep.anon_vma != next->anon_vma); - - vma_prepare(&vma_prep); - - if (start < vma->vm_start || end > vma->vm_end) - vma_changed = true; - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - - if (vma_changed) - vma_iter_store(vmi, vma); - - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; - if (adjust_next < 0) { - WARN_ON_ONCE(vma_changed); - vma_iter_store(vmi, next); - } - } - - vma_complete(&vma_prep, vmi, mm); - vma_iter_free(vmi); - validate_mm(mm); - - return 0; -} - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -993,7 +866,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_adjust drops the + * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must @@ -1003,6 +876,12 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. + * + * In the code below: + * PPPP is represented by *prev + * NNNN is represented by *mid (and possibly equal to *next) + * XXXX is represented by *next or not represented at all. + * AAAA is not represented - it will be merged or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -1013,11 +892,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + pgoff_t vma_pgoff; struct vm_area_struct *mid, *next, *res = NULL; + struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; bool merge_next = false; + bool vma_expanded = false; + struct vma_prepare vp; + unsigned long vma_end = end; + long adj_next = 0; + unsigned long vma_start = addr; + validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1035,13 +922,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* Can we merge the predecessor? */ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff, - vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; + if (prev) { + res = prev; + vma = prev; + vma_start = prev->vm_start; + vma_pgoff = prev->vm_pgoff; + /* Can we merge the predecessor? */ + if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) + && can_vma_merge_after(prev, vm_flags, anon_vma, file, + pgoff, vm_userfaultfd_ctx, anon_name)) { + merge_prev = true; + } } /* Can we merge the successor? */ if (next && end == next->vm_start && @@ -1051,32 +942,85 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } + + remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, prev); - res = prev; - } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, prev); - res = prev; + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + remove = mid; /* case 1 */ + vma_end = next->vm_end; + err = dup_anon_vma(res, remove); + if (mid != next) { /* case 6 */ + remove2 = next; + if (!remove->anon_vma) + err = dup_anon_vma(res, remove2); + } + } else if (merge_prev) { + err = 0; /* case 2 */ + if (mid && end > mid->vm_start) { + err = dup_anon_vma(res, mid); + if (end == mid->vm_end) { /* case 7 */ + remove = mid; + } else { /* case 5 */ + adjust = mid; + adj_next = (end - mid->vm_start); + } + } } else if (merge_next) { - if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, next); - else /* cases 3, 8 */ - err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, next); res = next; + if (prev && addr < prev->vm_end) { /* case 4 */ + vma_end = addr; + adjust = mid; + adj_next = -(vma->vm_end - addr); + err = dup_anon_vma(res, adjust); + } else { + vma = next; /* case 3 */ + vma_start = addr; + vma_end = next->vm_end; + vma_pgoff = mid->vm_pgoff; + err = 0; + if (mid != next) { /* case 8 */ + remove = mid; + err = dup_anon_vma(res, remove); + } + } } - /* - * Cannot merge with predecessor or successor or error in __vma_adjust? - */ + /* Cannot merge or error in anon_vma clone */ if (err) return NULL; + + if (vma_iter_prealloc(vmi)) + return NULL; + + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + vma->vm_start = vma_start; + vma->vm_end = vma_end; + vma->vm_pgoff = vma_pgoff; + + if (vma_expanded) + vma_iter_store(vmi, vma); + + if (adj_next) { + adjust->vm_start += adj_next; + adjust->vm_pgoff += adj_next >> PAGE_SHIFT; + if (adj_next < 0) { + WARN_ON(vma_expanded); + vma_iter_store(vmi, next); + } + } + + vma_complete(&vp, vmi, mm); + vma_iter_free(vmi); + validate_mm(mm); khugepaged_enter_vma(res, vm_flags); if (res) diff --git a/mm/rmap.c b/mm/rmap.c index 43760d622040..86fccc2b9fc9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -262,11 +262,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and - * anon_vma_fork(). The first three want an exact copy of src, while the last - * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent - * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, - * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), + * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, + * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to + * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before + * call, we can identify this case by checking (!dst->anon_vma && + * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. @@ -1253,7 +1254,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (likely(!folio_test_ksm(folio))) { - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ if (first) __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); @@ -2524,7 +2525,7 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); -- cgit v1.2.3 From 18b098af2890cdeab07368405409111197f190d2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:50 -0500 Subject: vma_merge: set vma iterator to correct position. When merging the previous value, set the vma iterator to the previous slot. Don't use the vma iterator to get the next/prev so that it is in the correct position for a write. Link: https://lkml.kernel.org/r/20230120162650.984577-50-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 8ce4cee42dce..b698a96d0511 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -932,6 +932,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { merge_prev = true; + vma_prev(vmi); } } /* Can we merge the successor? */ @@ -1023,9 +1024,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, validate_mm(mm); khugepaged_enter_vma(res, vm_flags); - if (res) - vma_iter_set(vmi, end); - return res; } -- cgit v1.2.3 From 06e78b614e3780f9ac32056f2861159fd19d9702 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:46 -0800 Subject: kernel/fork: convert vma assignment to a memcpy Patch series "introduce vm_flags modifier functions", v4. This patchset was originally published as a part of per-VMA locking [1] and was split after suggestion that it's viable on its own and to facilitate the review process. It is now a preprequisite for the next version of per-VMA lock patchset, which reuses vm_flags modifier functions to lock the VMA when vm_flags are being updated. VMA vm_flags modifications are usually done under exclusive mmap_lock protection because this attrubute affects other decisions like VMA merging or splitting and races should be prevented. Introduce vm_flags modifier functions to enforce correct locking. This patch (of 7): Convert vma assignment in vm_area_dup() to a memcpy() to prevent compiler errors when we add a const modifier to vma->vm_flags. Link: https://lkml.kernel.org/r/20230126193752.297968-1-surenb@google.com Link: https://lkml.kernel.org/r/20230126193752.297968-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Cc: Sebastian Reichel Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 441dcec60aae..9260f975b8f4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. */ - *new = data_race(*orig); + data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); dup_anon_vma_name(orig, new); } -- cgit v1.2.3 From bc292ab00f6c7a661a8a605c714e8a148f629ef6 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:47 -0800 Subject: mm: introduce vma->vm_flags wrapper functions vm_flags are among VMA attributes which affect decisions like VMA merging and splitting. Therefore all vm_flags modifications are performed after taking exclusive mmap_lock to prevent vm_flags updates racing with such operations. Introduce modifier functions for vm_flags to be used whenever flags are updated. This way we can better check and control correct locking behavior during these updates. Link: https://lkml.kernel.org/r/20230126193752.297968-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Davidlohr Bueso Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 10 +++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dcc34533d2f6..e2df5d122b67 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -627,6 +627,46 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ca11c6c46e8..10a1e41f4e70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -491,7 +491,15 @@ struct vm_area_struct { * See vmf_insert_mixed_prot() for discussion. */ pgprot_t vm_page_prot; - unsigned long vm_flags; /* Flags, see mm.h. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; /* * For areas with an address space and backing store, -- cgit v1.2.3 From e430a95a04efc557bc4ff9b3035c7c85aee5d63f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:48 -0800 Subject: mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK To simplify the usage of VM_LOCKED_CLEAR_MASK in vm_flags_clear(), replace it with VM_LOCKED_MASK bitmask and convert all users. Link: https://lkml.kernel.org/r/20230126193752.297968-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Davidlohr Bueso Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- kernel/fork.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 6 +++--- mm/mmap.c | 6 +++--- mm/mremap.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e2df5d122b67..663726ca2240 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR diff --git a/kernel/fork.c b/kernel/fork.c index 9260f975b8f4..5e3029ea8e1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f9df0143772..ab35b1cc9927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6969,8 +6969,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 0336f52e03d7..5c4fff93cd6b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (vma->vm_start != tmp) return -ENOMEM; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; @@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags) struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; @@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags) for_each_vma(vmi, vma) { vm_flags_t newflags; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ diff --git a/mm/mmap.c b/mm/mmap.c index b698a96d0511..03d7c37c5969 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2668,7 +2668,7 @@ expanded: if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -3338,8 +3338,8 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_init(vma, (vm_flags | mm->def_flags | + VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index 5c9a57909862..d70d8063c6e2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -687,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page -- cgit v1.2.3 From 1c71222e5f2393b5ea1a41795c67589eea7e3490 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:49 -0800 Subject: mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Sebastian Reichel Reviewed-by: Liam R. Howlett Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/kernel/process.c | 2 +- arch/ia64/mm/init.c | 8 ++++---- arch/loongarch/include/asm/tlb.h | 2 +- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 14 +++++++------- arch/s390/mm/gmap.c | 3 +-- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/cpu/sgx/driver.c | 2 +- arch/x86/kernel/cpu/sgx/virt.c | 2 +- arch/x86/mm/pat/memtype.c | 6 +++--- arch/x86/um/mem_32.c | 2 +- drivers/acpi/pfr_telemetry.c | 2 +- drivers/android/binder.c | 3 +-- drivers/char/mspec.c | 2 +- drivers/crypto/hisilicon/qm.c | 2 +- drivers/dax/device.c | 2 +- drivers/dma/idxd/cdev.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_gem_dma_helper.c | 3 +-- drivers/gpu/drm/drm_gem_shmem_helper.c | 2 +- drivers/gpu/drm/drm_vm.c | 8 ++++---- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 4 ++-- drivers/gpu/drm/gma500/framebuffer.c | 2 +- drivers/gpu/drm/i810/i810_dma.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 ++-- drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 2 +- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 3 +-- drivers/gpu/drm/tegra/gem.c | 5 ++--- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 +-- drivers/gpu/drm/virtio/virtgpu_vram.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- drivers/gpu/drm/xen/xen_drm_front_gem.c | 3 +-- drivers/hsi/clients/cmt_speech.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/hwtracing/stm/core.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 4 ++-- drivers/infiniband/hw/qib/qib_file_ops.c | 13 ++++++------- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +- drivers/media/common/videobuf2/videobuf2-dma-contig.c | 2 +- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf-dma-contig.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 4 ++-- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/misc/cxl/context.c | 2 +- drivers/misc/habanalabs/common/memory.c | 2 +- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++---- drivers/misc/habanalabs/goya/goya.c | 4 ++-- drivers/misc/ocxl/context.c | 4 ++-- drivers/misc/ocxl/sysfs.c | 2 +- drivers/misc/open-dice.c | 4 ++-- drivers/misc/sgi-gru/grufile.c | 4 ++-- drivers/misc/uacce/uacce.c | 2 +- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/cxlflash/ocxl_hw.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/media/atomisp/pci/hmm/hmm_bo.c | 2 +- drivers/staging/media/deprecated/meye/meye.c | 4 ++-- drivers/staging/media/deprecated/stkwebcam/stk-webcam.c | 2 +- drivers/target/target_core_user.c | 2 +- drivers/uio/uio.c | 2 +- drivers/usb/core/devio.c | 3 +-- drivers/usb/mon/mon_bin.c | 3 +-- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vhost/vdpa.c | 2 +- drivers/video/fbdev/68328fb.c | 2 +- drivers/video/fbdev/core/fb_defio.c | 4 ++-- drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 4 ++-- drivers/xen/privcmd-buf.c | 2 +- drivers/xen/privcmd.c | 4 ++-- fs/aio.c | 2 +- fs/cramfs/inode.c | 2 +- fs/erofs/data.c | 2 +- fs/exec.c | 4 ++-- fs/ext4/file.c | 2 +- fs/fuse/dax.c | 2 +- fs/hugetlbfs/inode.c | 4 ++-- fs/orangefs/file.c | 3 +-- fs/proc/task_mmu.c | 2 +- fs/proc/vmcore.c | 3 +-- fs/userfaultfd.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/mm.h | 2 +- kernel/bpf/ringbuf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++-- kernel/events/core.c | 2 +- kernel/kcov.c | 2 +- kernel/relay.c | 2 +- mm/madvise.c | 2 +- mm/memory.c | 6 +++--- mm/mlock.c | 6 +++--- mm/mmap.c | 10 +++++----- mm/mprotect.c | 2 +- mm/mremap.c | 6 +++--- mm/nommu.c | 11 ++++++----- mm/secretmem.c | 2 +- mm/shmem.c | 2 +- mm/vmalloc.c | 2 +- net/ipv4/tcp.c | 4 ++-- security/selinux/selinuxfs.c | 6 +++--- sound/core/oss/pcm_oss.c | 2 +- sound/core/pcm_native.c | 9 +++++---- sound/soc/pxa/mmp-sspa.c | 2 +- sound/usb/usx2y/us122l.c | 4 ++-- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 120 files changed, 188 insertions(+), 199 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f811733a8fc5..61c30b9a24ea 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -316,7 +316,7 @@ static int __init gate_vma_init(void) gate_vma.vm_page_prot = PAGE_READONLY_EXEC; gate_vma.vm_start = 0xffff0000; gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; - gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC); return 0; } arch_initcall(gate_vma_init); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index fc4e4217e87f..7f5353e28516 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -109,7 +109,7 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { @@ -127,8 +127,8 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { mmap_write_unlock(current->mm); @@ -272,7 +272,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX); return 0; diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index dd24f5898f65..f5e4deb97402 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -149,7 +149,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) struct vm_area_struct vma; vma.vm_mm = tlb->mm; - vma.vm_flags = 0; + vm_flags_init(&vma, 0); if (tlb->fullmm) { flush_tlb_mm(tlb->mm); return; diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 4f566bea5e10..712ab91ced39 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -324,7 +324,7 @@ static int kvmppc_xive_native_mmap(struct kvm_device *dev, return -EINVAL; } - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); /* diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index d73b3b4176e8..b75a9fb99599 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -156,7 +156,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * VM_NOHUGEPAGE and split them. */ for_each_vma_range(vmi, vma, addr + len) { - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); } } diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 9580e8e12165..36c21648d19a 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -525,7 +525,7 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) pfn = paste_addr >> PAGE_SHIFT; /* flags, page_prot from cxl_mmap(), except we want cachable */ - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 62d90a5e23d1..02a8158c469d 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -291,7 +291,7 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); vma->vm_ops = &spufs_mem_mmap_vmops; @@ -381,7 +381,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_cntl_mmap_vmops; @@ -1043,7 +1043,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal1_mmap_vmops; @@ -1179,7 +1179,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal2_mmap_vmops; @@ -1302,7 +1302,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mss_mmap_vmops; @@ -1364,7 +1364,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_psmap_mmap_vmops; @@ -1424,7 +1424,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mfc_mmap_vmops; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 69af6cdf1a2a..ab836597419d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2522,8 +2522,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 4af81df133ee..d234ca797e4a 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -391,7 +391,7 @@ void __init map_vsyscall(void) } if (vsyscall_mode == XONLY) - gate_vma.vm_flags = VM_EXEC; + vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b868867..262f5fb18d74 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma->vm_ops = &sgx_vm_ops; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); vma->vm_private_data = encl; return 0; diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 6a77a14eee38..c3e37eaec8ec 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &sgx_vepc_vm_ops; /* Don't copy VMA in fork() */ - vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); vma->vm_private_data = vepc; return 0; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index fb4b1b5e0dea..6ca51b1aa5d9 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1000,7 +1000,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) - vma->vm_flags |= VM_PAT; + vm_flags_set(vma, VM_PAT); return ret; } @@ -1066,7 +1066,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } free_pfn_range(paddr, size); if (vma) - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } /* @@ -1076,7 +1076,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, */ void untrack_pfn_moved(struct vm_area_struct *vma) { - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index cafd01f730da..29b2203bc82c 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = PAGE_READONLY; return 0; diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 27fb6cdad75f..843f678ade0c 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -310,7 +310,7 @@ pfrt_log_mmap(struct file *file, struct vm_area_struct *vma) return -EROFS; /* changing from read to write with mprotect is not allowed */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); pfrt_log_dev = to_pfrt_log_dev(file); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 880224ec6abb..cb08982b9666 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5572,8 +5572,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } - vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f8231e2e84be..b35f651837c8 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -206,7 +206,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, refcount_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); if (vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 007ac7a69ce7..733fe1033910 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2363,7 +2363,7 @@ static int hisi_qm_uacce_mmap(struct uacce_queue *q, return -EINVAL; } - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); return remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT, diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 5494d745ced5..223e4e233d19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -308,7 +308,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e13e92609943..674bfefca088 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -201,7 +201,7 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (rc < 0) return rc; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index bb7350ea1d75..a69fd6fdabb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -257,7 +257,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str */ if (is_cow_mapping(vma->vm_flags) && !(vma->vm_flags & VM_ACCESS_FLAGS)) - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return drm_gem_ttm_mmap(obj, vma); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6d291aa6386b..d0933dd9af06 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2879,8 +2879,8 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, address = dev->adev->rmmio_remap.bus_addr; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index cd4e61bf0493..cbef2e147da5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -159,8 +159,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, address = kfd_get_process_doorbells(pdd); if (!address) return -ENOMEM; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 729d26d648af..dd0436bf349a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1052,8 +1052,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) pfn = __pa(page->kernel_address); pfn >>= PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE - | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP); pr_debug("Mapping signal page\n"); pr_debug(" start user address == 0x%08lx\n", vma->vm_start); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 51b1683ac5c1..1fad0ecdfaeb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1978,8 +1978,8 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, return -ENOMEM; } - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP); /* Mapping pages to user process */ return remap_pfn_range(vma, vma->vm_start, PFN_DOWN(__pa(qpd->cwsr_kaddr)), diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index b8db675e7fb5..54c76003d2cc 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1047,7 +1047,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, goto err_drm_gem_object_put; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); } diff --git a/drivers/gpu/drm/drm_gem_dma_helper.c b/drivers/gpu/drm/drm_gem_dma_helper.c index 1e658c448366..fb2c764accc6 100644 --- a/drivers/gpu/drm/drm_gem_dma_helper.c +++ b/drivers/gpu/drm/drm_gem_dma_helper.c @@ -530,8 +530,7 @@ int drm_gem_dma_mmap(struct drm_gem_dma_object *dma_obj, struct vm_area_struct * * the whole buffer. */ vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTEXPAND, VM_PFNMAP); if (dma_obj->map_noncoherent) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index b602cd72a120..a2c28483e010 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -633,7 +633,7 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct if (ret) return ret; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); if (shmem->map_wc) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index f024dc93939e..87c9fe55dec7 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -476,7 +476,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) if (!capable(CAP_SYS_ADMIN) && (dma->flags & _DRM_DMA_USE_PCI_RO)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -492,7 +492,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; @@ -560,7 +560,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) return -EINVAL; if (!capable(CAP_SYS_ADMIN) && (map->flags & _DRM_READ_ONLY)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -628,7 +628,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) default: return -EINVAL; /* This should never happen. */ } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index c5ae5492e1af..b5f73502e3dd 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -130,7 +130,7 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, { pgprot_t vm_page_prot; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 3e493f48e0d4..638ca96830e9 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -274,7 +274,7 @@ static int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem *exynos_gem, unsigned long vm_size; int ret; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; vm_size = vma->vm_end - vma->vm_start; @@ -368,7 +368,7 @@ static int exynos_drm_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct if (obj->import_attach) return dma_buf_mmap(obj->dma_buf, vma, 0); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); DRM_DEV_DEBUG_KMS(to_dma_dev(obj->dev), "flags = 0x%x\n", exynos_gem->flags); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 8d5a37b8f110..a9276c8a3e4e 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -139,7 +139,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)fb; - vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c index 9fb4dd63342f..01967dd88762 100644 --- a/drivers/gpu/drm/i810/i810_dma.c +++ b/drivers/gpu/drm/i810/i810_dma.c @@ -102,7 +102,7 @@ static int i810_mmap_buffers(struct file *filp, struct vm_area_struct *vma) buf = dev_priv->mmap_buffer; buf_priv = buf->dev_private; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); buf_priv->currently_mapped = I810_BUF_MAPPED; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 0ad44f3868de..e95f4c729ca5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -979,7 +979,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) i915_gem_object_put(obj); return -EINVAL; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } anon = mmap_singleton(to_i915(dev)); @@ -988,7 +988,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) return PTR_ERR(anon); } - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); /* * We keep the ref on mmo->obj, not vm_file, but we require diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c index 47e96b0289f9..28659514bf20 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c @@ -158,7 +158,7 @@ static int mtk_drm_gem_object_mmap(struct drm_gem_object *obj, * dma_alloc_attrs() allocated a struct page table for mtk_gem, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 1dee0d18abbb..c2fb98a94bc3 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1012,7 +1012,7 @@ static int msm_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct { struct msm_gem_object *msm_obj = to_msm_bo(obj); - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = msm_gem_pgprot(msm_obj, vm_get_page_prot(vma->vm_flags)); return 0; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index cf571796fd26..19fef933904b 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -543,8 +543,7 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, { struct omap_gem_object *omap_obj = to_omap_bo(obj); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); if (omap_obj->flags & OMAP_BO_WC) { vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index 6edb7c52cb3d..8ea09d915c3c 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -251,8 +251,7 @@ static int rockchip_drm_gem_object_mmap(struct drm_gem_object *obj, * We allocated a struct page table for rk_obj, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 979e7bc902f6..bce991a2ccc0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -574,7 +574,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) * and set the vm_pgoff (used as a fake buffer offset by DRM) * to 0 as we want to map the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; err = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->iova, @@ -588,8 +588,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) } else { pgprot_t prot = vm_get_page_prot(vma->vm_flags); - vma->vm_flags |= VM_MIXEDMAP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(prot); } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 5a3e4b891377..c00207582c74 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -468,8 +468,7 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_private_data = bo; - vma->vm_flags |= VM_PFNMAP; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(ttm_bo_mmap_obj); diff --git a/drivers/gpu/drm/virtio/virtgpu_vram.c b/drivers/gpu/drm/virtio/virtgpu_vram.c index 6b45b0429fef..25df81c02783 100644 --- a/drivers/gpu/drm/virtio/virtgpu_vram.c +++ b/drivers/gpu/drm/virtio/virtgpu_vram.c @@ -46,7 +46,7 @@ static int virtio_gpu_vram_mmap(struct drm_gem_object *obj, return -EINVAL; vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_set(vma, VM_MIXEDMAP | VM_DONTEXPAND); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); vma->vm_ops = &virtio_gpu_vram_vm_ops; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 265f7c48d856..90097d04b45f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -97,7 +97,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ if (!is_cow_mapping(vma->vm_flags)) - vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; + vm_flags_mod(vma, VM_PFNMAP, VM_MIXEDMAP); ttm_bo_put(bo); /* release extra ref taken by ttm_bo_mmap_obj() */ diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 4c95ebcdcc2d..3ad2b4cfd1f0 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -69,8 +69,7 @@ static int xen_drm_front_gem_object_mmap(struct drm_gem_object *gem_obj, * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map * the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_mod(vma, VM_MIXEDMAP | VM_DONTEXPAND, VM_PFNMAP); vma->vm_pgoff = 0; /* diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 8069f795c864..daa8e1bff5d9 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1264,7 +1264,7 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma) if (vma_pages(vma) != 1) return -EINVAL; - vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTDUMP | VM_DONTEXPAND); vma->vm_ops = &cs_char_vm_ops; vma->vm_private_data = file->private_data; diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 6c8215a47a60..9621efe0e95c 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1659,7 +1659,7 @@ out: atomic_dec(&msc->user_count); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTCOPY; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &msc_mmap_ops; return ret; } diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 2712e699ba08..534fbefc7f6a 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -715,7 +715,7 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) pm_runtime_get_sync(&stm->dev); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &stm_mmap_vmops; vm_iomap_memory(vma, phys, size); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..c6e59bc480f9 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -403,7 +403,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); addr = vma->vm_start; for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { memlen = uctxt->egrbufs.buffers[i].len; @@ -528,7 +528,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; + vm_flags_reset(vma, flags); hfi1_cdbg(PROC, "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", ctxt, subctxt, type, mapio, vmf, memaddr, memlen, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c669ef6e47e7..e3c97aa2c46c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2087,7 +2087,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (!dev->mdev->clock_info) return -EOPNOTSUPP; @@ -2311,7 +2311,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 3937144b2ae5..80fe92a21f96 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -733,7 +733,7 @@ static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, } /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; @@ -769,7 +769,7 @@ static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, phys = dd->physaddr + ureg; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, vma->vm_end - vma->vm_start, @@ -810,8 +810,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, * don't allow them to later change to readable with mprotect (for when * not initially mapped readable, as is normally the case) */ - vma->vm_flags &= ~VM_MAYREAD; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND, VM_MAYREAD); /* We used PAT if wc_cookie == 0 */ if (!dd->wc_cookie) @@ -852,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, goto bail; } /* don't allow them to later change to writable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); start = vma->vm_start; @@ -944,7 +943,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, * Don't allow permission to later change to writable * with mprotect. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } else goto bail; len = vma->vm_end - vma->vm_start; @@ -955,7 +954,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); ret = 1; bail: diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6e8c4fbb8083..6289238cc5af 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -672,7 +672,7 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_dbg("\n"); us_ibdev = to_usdev(context->device); - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vfid = vma->vm_pgoff; usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 19176583dbde..9f54aa90a35a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -408,7 +408,7 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) } /* Map UAR to kernel space, VM_LOCKED? */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, start, context->uar.pfn, size, vma->vm_page_prot)) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 5f1175f8b349..205d3cac425c 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -293,7 +293,7 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 959b45beb1f3..a6c6d2fcaaa4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -185,7 +185,7 @@ static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) /* * Make sure that vm_areas for 2 buffers won't be merged together */ - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index f2c439359557..4c2ec7a0d804 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -314,7 +314,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = map; dev_dbg(q->dev, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 234e9f647c96..53001532e8e3 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -630,8 +630,8 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ + /* using shared anonymous pages */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", map, q, vma->vm_start, vma->vm_end, vma->vm_pgoff, first, last); diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index 9b2443720ab0..85c7090606d6 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -247,7 +247,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index acaa44809c58..76b5ea66dfa1 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -220,7 +220,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma) pr_devel("%s: mmio physical: %llx pe: %i master:%i\n", __func__, ctx->psn_phys, ctx->pe , ctx->master); - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &cxl_mmap_vmops; return 0; diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 5e9ae7600d75..6bb44a3ad5e6 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2082,7 +2082,7 @@ static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, v { struct hl_ts_buff *ts_buff = buf->private; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE); return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0); } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 9f5e208701ba..3b0afdc50ff9 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4236,8 +4236,8 @@ static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index e793fb2bdcbe..65502ec02bc0 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -5538,8 +5538,8 @@ static int gaudi2_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); #ifdef _HAS_DMA_MMAP_COHERENT @@ -10116,8 +10116,8 @@ static int gaudi2_block_mmap(struct hl_device *hdev, struct vm_area_struct *vma, address = pci_resource_start(hdev->pdev, SRAM_CFG_BAR_ID) + offset_in_bar; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, block_size, vma->vm_page_prot); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 0f083fcf81a6..2a15a305d01b 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2880,8 +2880,8 @@ static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 9eb0d93b01c6..7f83116ae11a 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -180,7 +180,7 @@ static int check_mmap_afu_irq(struct ocxl_context *ctx, if ((vma->vm_flags & VM_READ) || (vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_WRITE)) return -EINVAL; - vma->vm_flags &= ~(VM_MAYREAD | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYREAD | VM_MAYEXEC); return 0; } @@ -204,7 +204,7 @@ int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma) if (rc) return rc; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxl_vmops; return 0; diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c index 25c78df8055d..405180d47d9b 100644 --- a/drivers/misc/ocxl/sysfs.c +++ b/drivers/misc/ocxl/sysfs.c @@ -134,7 +134,7 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, (afu->config.global_mmio_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &global_mmio_vmops; vma->vm_private_data = afu; diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 9dda47b3fd70..8aea2d070a40 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -95,12 +95,12 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTDUMP); return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); } diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 7ffcfc0bb587..a3d659c11cc4 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -101,8 +101,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | - VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index 905eff1f840e..b65ab440a19e 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -229,7 +229,7 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (!qfr) return -ENOMEM; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK); vma->vm_ops = &uacce_vm_ops; vma->vm_private_data = q; qfr->type = type; diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 21b7cb6e7e70..e300cf26bc2a 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -389,7 +389,7 @@ static int dax_devmap(struct file *f, struct vm_area_struct *vma) /* completion area is mapped read-only for user */ if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (remap_pfn_range(vma, vma->vm_start, ctx->ca_buf_ra >> PAGE_SHIFT, len, vma->vm_page_prot)) diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 631eda2d467e..6542818e595a 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -1167,7 +1167,7 @@ static int afu_mmap(struct file *file, struct vm_area_struct *vma) (ctx->psn_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxlflash_vmops; return 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ff9854f59964..a91049213203 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1288,7 +1288,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; out: diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 5e53eed8ae95..095cd0ba8c21 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -1072,7 +1072,7 @@ int hmm_bo_mmap(struct vm_area_struct *vma, struct hmm_buffer_object *bo) vma->vm_private_data = bo; vma->vm_ops = &hmm_bo_vm_ops; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); /* * call hmm_bo_vm_open explicitly. diff --git a/drivers/staging/media/deprecated/meye/meye.c b/drivers/staging/media/deprecated/meye/meye.c index 5d87efd9b95c..746c6ea1c0a7 100644 --- a/drivers/staging/media/deprecated/meye/meye.c +++ b/drivers/staging/media/deprecated/meye/meye.c @@ -1476,8 +1476,8 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &meye_vm_ops; - vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + /* not I/O memory */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c index 787edb3d47c2..a1b7ad350a90 100644 --- a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c +++ b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c @@ -779,7 +779,7 @@ static int v4l_stk_mmap(struct file *fp, struct vm_area_struct *vma) ret = remap_vmalloc_range(vma, sbuf->buffer, 0); if (ret) return ret; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = sbuf; vma->vm_ops = &stk_v4l_vm_ops; sbuf->v4lbuf.flags |= V4L2_BUF_FLAG_MAPPED; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 2940559c3086..15ffc8d2ac7b 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1928,7 +1928,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &tcmu_vm_ops; vma->vm_private_data = udev; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 43afbb7c5ab9..62082d64ece0 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -713,7 +713,7 @@ static const struct vm_operations_struct uio_logical_vm_ops = { static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &uio_logical_vm_ops; return 0; } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 837f3e57f580..e501a03d6c70 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -279,8 +279,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) } } - vma->vm_flags |= VM_IO; - vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP); + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 094e812e9e69..abb1cd35d8a6 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1272,8 +1272,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index e682bc7ee6c9..5e4a77b9bae6 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -512,7 +512,7 @@ static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) { struct vduse_iova_domain *domain = file->private_data; - vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); vma->vm_private_data = domain; vma->vm_ops = &vduse_domain_mmap_ops; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 26a541cc64d1..c49f8f2b2865 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1799,7 +1799,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ec32f785dfde..9c5010ee20da 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1315,7 +1315,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 7db03ed77c76..41df61b37a18 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -391,7 +391,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_start = videomemory; return 0; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c730253ab85c..dc310c7b5769 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -232,9 +232,9 @@ static const struct address_space_operations fb_deferred_io_aops = { int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a15729beb9d1..26ffb8755ffb 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -525,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4d9a3050de6a..61faea1f0663 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1055,10 +1055,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; if (map->flags) { diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index dd5bbb6e1b6b..2fa10ca5be14 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -156,7 +156,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) vma_priv->file_priv = file_priv; vma_priv->users = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); vma->vm_ops = &privcmd_buf_vm_ops; vma->vm_private_data = vma_priv; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1edf45ee9890..e2f580e30a86 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -934,8 +934,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/aio.c b/fs/aio.c index 562916d85cba..5a88caf52be4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -390,7 +390,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 50e4e060db68..45a65c400bd0 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -408,7 +408,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) * unpopulated ptes via cramfs_read_folio(). */ int i; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..f32d65987578 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -429,7 +429,7 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else diff --git a/fs/exec.c b/fs/exec.c index c0df813d2b45..d2e2a15e5cfe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); @@ -834,7 +834,7 @@ int setup_arg_pages(struct linux_binprm *bprm, } /* mprotect_fixup is overkill to remove the temporary stack flags */ - vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd371..6bdf61a62c79 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -801,7 +801,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a8013..8e74f278a3f6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -860,7 +860,7 @@ int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 48f1a8ad2243..44ecdcb796cc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -132,7 +132,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_future_write(info->seals, vma); @@ -811,7 +811,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. */ vma_init(&pseudo_vma, mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..a5e1ea8b7119 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -389,8 +389,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vma->vm_flags |= VM_SEQ_READ; - vma->vm_flags &= ~VM_RAND_READ; + vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); file_accessed(file); vma->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a944e1816364..6a96e1713fd5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1299,7 +1299,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b1273..12af614f33ce 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -582,8 +582,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f3c75c6222de..44d1ee429eb0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -113,7 +113,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - vma->vm_flags = flags; + vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b9..b0039a8fea2e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,7 +1429,7 @@ xfs_file_mmap( file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 663726ca2240..ce6d9d765aae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3653,7 +3653,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) * VM_MAYWRITE as we still want them to be COW-writable. */ if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); + vm_flags_clear(vma, VM_MAYWRITE); } return 0; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aaf..8732e0aadf36 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma */ return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64131f88c553..9f56b442daa9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -882,10 +882,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; - vma->vm_flags &= ~VM_MAYEXEC; + vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..55a82f12a42c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6573,7 +6573,7 @@ aux_unlock: * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a05..84c717337df0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) goto exit; } spin_unlock_irqrestore(&kcov->lock, flags); - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9..9aa70ae53d24 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = buf; return 0; diff --git a/mm/madvise.c b/mm/madvise.c index ca672e37b38c..5a5a687d03c2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -176,7 +176,7 @@ success: /* * vm_flags is protected by the mmap_lock held in write mode. */ - vma->vm_flags = new_flags; + vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) diff --git a/mm/memory.c b/mm/memory.c index 029f838587d1..4354b7987f36 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1928,7 +1928,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); @@ -1986,7 +1986,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot); } @@ -2452,7 +2452,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index 5c4fff93cd6b..ed49459e343e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); } } @@ -457,7 +457,7 @@ success: if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } diff --git a/mm/mmap.c b/mm/mmap.c index 03d7c37c5969..33c638c7ec04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2555,7 +2555,7 @@ cannot_expand: vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -2683,7 +2683,7 @@ expanded: * then new mapped in-place (which must be aimed as * a completely new data area). */ - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); @@ -2909,7 +2909,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vma_prepare(&vp); vma->vm_end = addr + len; - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, mm); @@ -2926,7 +2926,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = addr >> PAGE_SHIFT; - vma->vm_flags = flags; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2938,7 +2938,7 @@ out: mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); validate_mm(mm); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index cce6a0e58fb5..1d4843c97c2a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -670,7 +670,7 @@ success: * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); diff --git a/mm/mremap.c b/mm/mremap.c index d70d8063c6e2..411a85682b58 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -662,7 +662,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { - vma->vm_flags &= ~VM_ACCOUNT; + vm_flags_clear(vma, VM_ACCOUNT); if (vma->vm_start < old_addr) account_start = vma->vm_start; if (vma->vm_end > old_addr + old_len) @@ -719,12 +719,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (account_start) { vma = vma_prev(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } if (account_end) { vma = vma_next(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } return new_addr; diff --git a/mm/nommu.c b/mm/nommu.c index 9a166738909e..57ba243c6a37 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -173,7 +173,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) - vma->vm_flags |= VM_USERMAP; + vm_flags_set(vma, VM_USERMAP); mmap_write_unlock(current->mm); } @@ -950,7 +950,8 @@ static int do_mmap_private(struct vm_area_struct *vma, atomic_long_add(total, &mmap_pages_allocated); - region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); + region->vm_flags = vma->vm_flags; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); @@ -1047,7 +1048,7 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -1111,7 +1112,7 @@ unsigned long do_mmap(struct file *file, vma->vm_end = start + len; if (pregion->vm_flags & VM_MAPPED_COPY) - vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); else { ret = do_mmap_shared_file(vma); if (ret < 0) { @@ -1601,7 +1602,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/secretmem.c b/mm/secretmem.c index be3fff86ba00..8453ada8f41d 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -128,7 +128,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9e1015cbad29..732969afabd1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2304,7 +2304,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return ret; /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; + vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9b71ec3213cb..ff4d7dfdf84a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3596,7 +3596,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f713c0422f0f..7db45cdc3e1a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1890,10 +1890,10 @@ int tcp_mmap(struct file *file, struct socket *sock, { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0a6894cdc54d..18498979a640 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -262,7 +262,7 @@ static int sel_mmap_handle_status(struct file *filp, if (vma->vm_flags & VM_WRITE) return -EPERM; /* disallow mprotect() turns it into writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return remap_pfn_range(vma, vma->vm_start, page_to_pfn(status), @@ -506,13 +506,13 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) { /* do not allow mprotect to make mapping writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (vma->vm_flags & VM_WRITE) return -EACCES; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ac2efeb63a39..728c211142d1 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -2910,7 +2910,7 @@ static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ - area->vm_flags |= VM_READ; + vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9c122e757efe..331380c2438b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3675,8 +3675,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - area->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP, + VM_WRITE | VM_MAYWRITE); + return 0; } @@ -3712,7 +3713,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -3828,7 +3829,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); if (!substream->ops->page && !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area)) return 0; diff --git a/sound/soc/pxa/mmp-sspa.c b/sound/soc/pxa/mmp-sspa.c index fb5a4390443f..b3c1744eff91 100644 --- a/sound/soc/pxa/mmp-sspa.c +++ b/sound/soc/pxa/mmp-sspa.c @@ -404,7 +404,7 @@ static int mmp_pcm_mmap(struct snd_soc_component *component, struct snd_pcm_substream *substream, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, substream->dma_buffer.addr >> PAGE_SHIFT, diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index e558931cce16..709ccad972e2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -224,9 +224,9 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_DONTDUMP; + vm_flags_set(area, VM_DONTDUMP); if (!read) - area->vm_flags |= VM_DONTEXPAND; + vm_flags_set(area, VM_DONTEXPAND); area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index c29da0341bc5..4937ede0b5d7 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -61,7 +61,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep *hw, struct file *filp, struct vm } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 767a227d54da..36f2e31168fb 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -706,7 +706,7 @@ static int snd_usx2y_hwdep_pcm_mmap(struct snd_hwdep *hw, struct file *filp, str return -ENODEV; area->vm_ops = &snd_usx2y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } -- cgit v1.2.3 From ff126c0ece69de1c12d2f6e77ec77df997dd19e6 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:50 -0800 Subject: mm: replace vma->vm_flags indirect modification in ksm_madvise Replace indirect modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. Link: https://lkml.kernel.org/r/20230126193752.297968-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Michael Ellerman [powerpc] Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 6 +++++- arch/s390/mm/gmap.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 1d67baa5557a..709ebd578394 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, { unsigned long gfn = memslot->base_gfn; unsigned long end, start = gfn_to_hva(kvm, gfn); + unsigned long vm_flags; int ret = 0; struct vm_area_struct *vma; int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; @@ -409,12 +410,15 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - merge_flag, &vma->vm_flags); + merge_flag, &vm_flags); if (ret) { ret = H_STATE; break; } + vm_flags_reset(vma, vm_flags); start = vma->vm_end; } while (end > vma->vm_end); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ab836597419d..5a716bdcba05 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2587,14 +2587,18 @@ int gmap_mark_unmergeable(void) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + unsigned long vm_flags; int ret; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); + MADV_UNMERGEABLE, &vm_flags); if (ret) return ret; + vm_flags_reset(vma, vm_flags); } mm->def_flags &= ~VM_MERGEABLE; return 0; -- cgit v1.2.3 From 68f48381d7fdd1cbb9d88c37a4dfbb98ac78226d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:51 -0800 Subject: mm: introduce __vm_flags_mod and use it in untrack_pfn There are scenarios when vm_flags can be modified without exclusive mmap_lock, such as: - after VMA was isolated and mmap_lock was downgraded or dropped - in exit_mmap when there are no other mm users and locking is unnecessary Introduce __vm_flags_mod to avoid assertions when the caller takes responsibility for the required locking. Pass a hint to untrack_pfn to conditionally use __vm_flags_mod for flags modification to avoid assertion. Link: https://lkml.kernel.org/r/20230126193752.297968-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 10 +++++++--- include/linux/mm.h | 14 ++++++++++++-- include/linux/pgtable.h | 5 +++-- mm/memory.c | 13 +++++++------ mm/memremap.c | 4 ++-- mm/mmap.c | 16 ++++++++++------ 6 files changed, 41 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 6ca51b1aa5d9..691bf8934b6f 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) + unsigned long size, bool mm_wr_locked) { resource_size_t paddr; unsigned long prot; @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - if (vma) - vm_flags_clear(vma, VM_PAT); + if (vma) { + if (mm_wr_locked) + vm_flags_clear(vma, VM_PAT); + else + __vm_flags_mod(vma, 0, VM_PAT); + } } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index ce6d9d765aae..27b34f7730e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -656,6 +656,16 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. @@ -664,7 +674,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { mmap_assert_write_locked(vma->vm_mm); - vm_flags_init(vma, (vma->vm_flags | set) & ~clear); + __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) @@ -2085,7 +2095,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); struct mmu_notifier_range; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5fd45454c073..c63cd44777ec 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long size, + bool mm_wr_locked) { } @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); + unsigned long size, bool mm_wr_locked); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif diff --git a/mm/memory.c b/mm/memory.c index 4354b7987f36..ce1d84d022d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) + struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0); + untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -1675,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr) + unsigned long end_addr, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1689,7 +1689,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { - unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + unmap_single_vma(tlb, vma, start_addr, end_addr, &details, + mm_wr_locked); } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1723,7 +1724,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details); + unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -2492,7 +2493,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/memremap.c b/mm/memremap.c index 08cbf54fe037..2f88f43d4a01 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -129,7 +129,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); pgmap_array_delete(range); } @@ -276,7 +276,7 @@ err_add_memory: if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); err_pfn_remap: pgmap_array_delete(range); return error; diff --git a/mm/mmap.c b/mm/mmap.c index 33c638c7ec04..20f21f0949dd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -78,7 +78,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -2133,14 +2133,14 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool mm_wr_locked) { struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); @@ -2388,7 +2388,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade); /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); @@ -2701,7 +2705,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, - vma->vm_end); + vma->vm_end, true); } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); @@ -3029,7 +3033,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); mmap_read_unlock(mm); /* -- cgit v1.2.3 From c2fdc235300a027adc04a41b383bd78ab5da56f4 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:52 -0800 Subject: mm: export dump_mm() mmap_assert_write_locked() is used in vm_flags modifiers. Because mmap_assert_write_locked() uses dump_mm() and vm_flags are sometimes modified from inside a module, it's necessary to export dump_mm() function. Link: https://lkml.kernel.org/r/20230126193752.297968-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/debug.c b/mm/debug.c index 9d3d893dc7f4..96d594e16292 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm) mm->def_flags, &mm->def_flags ); } +EXPORT_SYMBOL(dump_mm); static bool page_init_poisoning __read_mostly = true; -- cgit v1.2.3 From 8f17febb34ceb464e3ff99e9436d0ae3f47b4862 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Sun, 29 Jan 2023 10:14:35 +0800 Subject: kasan: infer allocation size by scanning metadata Make KASAN scan metadata to infer the requested allocation size instead of printing cache->object_size. This patch fixes confusing slab-out-of-bounds reports as reported in: https://bugzilla.kernel.org/show_bug.cgi?id=216457 As an example of the confusing behavior, the report below hints that the allocation size was 192, while the kernel actually called kmalloc(184): ================================================================== BUG: KASAN: slab-out-of-bounds in _find_next_bit+0x143/0x160 lib/find_bit.c:109 Read of size 8 at addr ffff8880175766b8 by task kworker/1:1/26 ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 184 bytes inside of 192-byte region [ffff888017576600, ffff8880175766c0) ... Memory state around the buggy address: ffff888017576580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888017576600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888017576680: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc ^ ffff888017576700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888017576780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== With this patch, the report shows: ================================================================== ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 0 bytes to the right of allocated 184-byte region [ffff888017576600, ffff8880175766b8) ... ================================================================== Also report slab use-after-free bugs as "slab-use-after-free" and print "freed" instead of "allocated" in the report when describing the accessed memory region. Also improve the metadata-related comment in kasan_find_first_bad_addr and use addr_has_metadata across KASAN code instead of open-coding KASAN_SHADOW_START checks. [akpm@linux-foundation.org: fix printk warning] Link: https://bugzilla.kernel.org/show_bug.cgi?id=216457 Link: https://lkml.kernel.org/r/20230129021437.18812-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Co-developed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Chinwen Chang Cc: Dmitry Vyukov Cc: Matthias Brugger Cc: Qun-Wei Lin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 4 +--- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 41 ++++++++++++++++++++++++++++++----------- mm/kasan/report_generic.c | 32 +++++++++++++++++++++++++++++++- mm/kasan/report_hw_tags.c | 35 ++++++++++++++++++++++++++++++++++- mm/kasan/report_sw_tags.c | 26 ++++++++++++++++++++++++++ mm/kasan/report_tags.c | 2 +- mm/kasan/sw_tags.c | 6 ++---- 8 files changed, 127 insertions(+), 21 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b076f597a378..a37b5b57bf5c 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -172,10 +172,8 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata((void *)addr))) return !kasan_report(addr, size, write, ret_ip); - } if (likely(!memory_is_poisoned(addr, size))) return true; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 32413f22aa82..308fb70fd40a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -207,6 +207,7 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + size_t alloc_size; /* Filled in by the mode-specific reporting code. */ const char *bug_type; @@ -323,6 +324,7 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *kasan_find_first_bad_addr(void *addr, size_t size); +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 22598b20c7b7..89078f912827 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -231,33 +231,46 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(const void *addr, struct kmem_cache *cache, - void *object) +static void describe_object_addr(const void *addr, struct kasan_report_info *info) { unsigned long access_addr = (unsigned long)addr; - unsigned long object_addr = (unsigned long)object; - const char *rel_type; + unsigned long object_addr = (unsigned long)info->object; + const char *rel_type, *region_state = ""; int rel_bytes; pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", - object, cache->name, cache->object_size); + info->object, info->cache->name, info->cache->object_size); if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; - } else if (access_addr >= object_addr + cache->object_size) { + } else if (access_addr >= object_addr + info->alloc_size) { rel_type = "to the right"; - rel_bytes = access_addr - (object_addr + cache->object_size); + rel_bytes = access_addr - (object_addr + info->alloc_size); } else { rel_type = "inside"; rel_bytes = access_addr - object_addr; } + /* + * Tag-Based modes use the stack ring to infer the bug type, but the + * memory region state description is generated based on the metadata. + * Thus, defining the region state as below can contradict the metadata. + * Fixing this requires further improvements, so only infer the state + * for the Generic mode. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { + if (strcmp(info->bug_type, "slab-out-of-bounds") == 0) + region_state = "allocated "; + else if (strcmp(info->bug_type, "slab-use-after-free") == 0) + region_state = "freed "; + } + pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%px, %px)\n", - rel_bytes, rel_type, cache->object_size, (void *)object_addr, - (void *)(object_addr + cache->object_size)); + " %s%zu-byte region [%px, %px)\n", + rel_bytes, rel_type, region_state, info->alloc_size, + (void *)object_addr, (void *)(object_addr + info->alloc_size)); } static void describe_object_stacks(struct kasan_report_info *info) @@ -279,7 +292,7 @@ static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) describe_object_stacks(info); - describe_object_addr(addr, info->cache, info->object); + describe_object_addr(addr, info); } static inline bool kernel_or_module_addr(const void *addr) @@ -436,6 +449,12 @@ static void complete_report_info(struct kasan_report_info *info) if (slab) { info->cache = slab->slab_cache; info->object = nearest_obj(info->cache, slab, addr); + + /* Try to determine allocation size based on the metadata. */ + info->alloc_size = kasan_get_alloc_size(info->object, info->cache); + /* Fallback to the object size if failed. */ + if (!info->alloc_size) + info->alloc_size = info->cache->object_size; } else info->cache = info->object = NULL; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 043c94b04605..87d39bc0a673 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -43,6 +43,34 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow == 0) + size += KASAN_GRANULE_SIZE; + else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1) + return size + *shadow; + else + return size; + shadow++; + } + + return cache->object_size; +} + static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -79,9 +107,11 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) bug_type = "stack-out-of-bounds"; break; case KASAN_PAGE_FREE: + bug_type = "use-after-free"; + break; case KASAN_SLAB_FREE: case KASAN_SLAB_FREETRACK: - bug_type = "use-after-free"; + bug_type = "slab-use-after-free"; break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index f3d3be614e4b..32e80f78de7d 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,10 +17,43 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { - /* Return the same value regardless of whether addr_has_metadata(). */ + /* + * Hardware Tag-Based KASAN only calls this function for normal memory + * accesses, and thus addr points precisely to the first bad address + * with an invalid (and present) memory tag. Therefore: + * 1. Return the address as is without walking memory tags. + * 2. Skip the addr_has_metadata check. + */ return kasan_reset_tag(addr); } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + int i = 0; + u8 memory_tag; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + while (size < cache->object_size) { + memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE); + if (memory_tag != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + i++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { int i; diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 7a26397297ed..8b1f5a73ee6d 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -45,6 +45,32 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + shadow++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index ecede06ef374..8b8bfdb3cfdb 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -89,7 +89,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * a use-after-free. */ if (!info->bug_type) - info->bug_type = "use-after-free"; + info->bug_type = "slab-use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index a3afaf2ad1b1..30da65fa02a1 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -106,10 +106,8 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; untagged_addr = kasan_reset_tag((const void *)addr); - if (unlikely(untagged_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata(untagged_addr))) return !kasan_report(addr, size, write, ret_ip); - } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { @@ -127,7 +125,7 @@ bool kasan_byte_accessible(const void *addr) void *untagged_addr = kasan_reset_tag(addr); u8 shadow_byte; - if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + if (!addr_has_metadata(untagged_addr)) return false; shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); -- cgit v1.2.3 From 93419139fa14124c1c507d804f2b28866ebee28d Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Sun, 29 Jan 2023 04:06:51 +0000 Subject: memory tier: release the new_memtier in find_create_memory_tier() In find_create_memory_tier(), if failed to register device, then we should release new_memtier from the tier list and put device instead of memtier. Link: https://lkml.kernel.org/r/20230129040651.1329208-1-tongtiangen@huawei.com Fixes: 9832fb87834e ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: Tong Tiangen Cc: Aneesh Kumar K.V Cc: Hanjun Guo Cc: Kefeng Wang Cc: Guohanjun Cc: Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c734658c6242..e593e56e530b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -211,8 +211,8 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty ret = device_register(&new_memtier->dev); if (ret) { - list_del(&memtier->list); - put_device(&memtier->dev); + list_del(&new_memtier->list); + put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; -- cgit v1.2.3 From 6069b9ec8c0fcd5ee09167020e9e5e673fce93f8 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:32 +0200 Subject: arm: include asm-generic/memory_model.h from page.h rather than memory.h Patch series "mm, arch: add generic implementation of pfn_valid() for FLATMEM", v2. Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions This patch (of 4): Makes it consistent with other architectures and allows for generic definition of pfn_valid() in asm-generic/memory_model.h with clear override in arch/arm/include/asm/page.h Link: https://lkml.kernel.org/r/20230129124235.209895-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20230129124235.209895-2-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/include/asm/memory.h | 2 -- arch/arm/include/asm/page.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index d8eef4bd8c71..62e9df024445 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -386,6 +386,4 @@ static inline unsigned long __virt_to_idmap(unsigned long x) #endif -#include - #endif diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 5fcc8a600e36..74bb5947b387 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -158,6 +158,7 @@ typedef struct page *pgtable_t; #ifdef CONFIG_HAVE_ARCH_PFN_VALID extern int pfn_valid(unsigned long); +#define pfn_valid pfn_valid #endif #include @@ -167,5 +168,6 @@ extern int pfn_valid(unsigned long); #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC #include +#include #endif -- cgit v1.2.3 From d82f07f06cf85b4fe4560ee7c1a460591db840df Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:33 +0200 Subject: m68k: use asm-generic/memory_model.h for both MMU and !MMU The MMU variant uses generic definitions of page_to_pfn() and pfn_to_page(), but !MMU defines them in include/asm/page_no.h for no good reason. Include asm-generic/memory_model.h in the common include/asm/page.h and drop redundant definitions. Link: https://lkml.kernel.org/r/20230129124235.209895-3-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/include/asm/page.h | 6 +----- arch/m68k/include/asm/page_mm.h | 1 - arch/m68k/include/asm/page_no.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h index 2f1c54e4725d..a5993ad83ed8 100644 --- a/arch/m68k/include/asm/page.h +++ b/arch/m68k/include/asm/page.h @@ -62,11 +62,7 @@ extern unsigned long _ramend; #include #endif -#ifndef CONFIG_MMU -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) PFN_PHYS(pfn) -#endif - #include +#include #endif /* _M68K_PAGE_H */ diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index a5b459bcb7d8..3903db2e8da7 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -134,7 +134,6 @@ extern int m68k_virt_to_node_shift; }) #define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) -#include #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index abd2c3aeb015..83d345f482bd 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_to_page(pfn) virt_to_page(pfn_to_virt(pfn)) -#define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ -- cgit v1.2.3 From c2524a6b7de1e8d2767ddc1ee43ee241a580e677 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:34 +0200 Subject: mips: drop definition of pfn_valid() for DISCONTIGMEM There is a stale definition of pfn_valid() for DISCONTINGMEM memory model guarded !FLATMEM && !SPARSEMEM && NUMA ifdefery. Remove everything but definition of pfn_valid() for FLATMEM. Link: https://lkml.kernel.org/r/20230129124235.209895-4-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/include/asm/page.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 96bc798c1ec1..9286f11ff6ad 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -235,21 +235,6 @@ static inline int pfn_valid(unsigned long pfn) return pfn >= pfn_offset && pfn < max_mapnr; } -#elif defined(CONFIG_SPARSEMEM) - -/* pfn_valid is defined in linux/mmzone.h */ - -#elif defined(CONFIG_NUMA) - -#define pfn_valid(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - int __n = pfn_to_nid(__pfn); \ - ((__n >= 0) ? (__pfn < NODE_DATA(__n)->node_start_pfn + \ - NODE_DATA(__n)->node_spanned_pages) \ - : 0); \ -}) - #endif #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) -- cgit v1.2.3 From e5080a9677854bdd82383713cba168c1b13e46ba Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:35 +0200 Subject: mm, arch: add generic implementation of pfn_valid() for FLATMEM Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions. [rppt@kernel.org: fix the generic pfn_valid()] Link: https://lkml.kernel.org/r/Y9lg7R1Yd931C+y5@kernel.org Link: https://lkml.kernel.org/r/20230129124235.209895-5-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: Arnd Bergmann Acked-by: Guo Ren [csky] Acked-by: Huacai Chen [LoongArch] Acked-by: Stafford Horne [OpenRISC] Acked-by: Michael Ellerman [powerpc] Reviewed-by: David Hildenbrand Tested-by: Conor Dooley Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Helge Deller Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 4 ---- arch/arc/include/asm/page.h | 1 - arch/csky/include/asm/page.h | 1 - arch/hexagon/include/asm/page.h | 1 - arch/ia64/include/asm/page.h | 4 ---- arch/loongarch/include/asm/page.h | 13 ------------- arch/m68k/include/asm/page_no.h | 2 -- arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 13 ------------- arch/nios2/include/asm/page.h | 9 --------- arch/openrisc/include/asm/page.h | 2 -- arch/parisc/include/asm/page.h | 4 ---- arch/powerpc/include/asm/page.h | 9 --------- arch/riscv/include/asm/page.h | 5 ----- arch/sh/include/asm/page.h | 3 --- arch/sparc/include/asm/page_32.h | 1 - arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page_32.h | 4 ---- arch/x86/include/asm/page_64.h | 4 ---- arch/xtensa/include/asm/page.h | 4 ++-- include/asm-generic/memory_model.h | 12 ++++++++++++ include/asm-generic/page.h | 2 -- 22 files changed, 14 insertions(+), 86 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index bc5256fba8f0..4db1ebc0ed99 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -86,10 +86,6 @@ typedef struct page *pgtable_t; #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include #include diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9a62e1d87967..e43fe27ec54d 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -109,7 +109,6 @@ extern int pfn_valid(unsigned long pfn); #else /* CONFIG_HIGHMEM */ #define ARCH_PFN_OFFSET virt_to_pfn(CONFIG_LINUX_RAM_BASE) -#define pfn_valid(pfn) (((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #endif /* CONFIG_HIGHMEM */ diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index ed7451478b1b..b23e3006a9e0 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -39,7 +39,6 @@ #define virt_addr_valid(kaddr) ((void *)(kaddr) >= (void *)PAGE_OFFSET && \ (void *)(kaddr) < high_memory) -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) extern void *memset(void *dest, int c, size_t l); extern void *memcpy(void *to, const void *from, size_t l); diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index d7d4f9fca327..9c03b9965f07 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -95,7 +95,6 @@ struct page; /* Default vm area behavior is non-executable. */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) /* Need to not use a define for linesize; may move this to another file. */ diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index ba0b365cf2b2..310b09c3342d 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -95,10 +95,6 @@ do { \ #include -#ifdef CONFIG_FLATMEM -# define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 53f284a96182..fb5338b352e6 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -82,19 +82,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 83d345f482bd..43ff6b109ebb 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 4b8b2fa78fc5..7b9861bcd458 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,7 +112,6 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 9286f11ff6ad..5978a8dfb917 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -224,19 +224,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 6a989819a7c1..0ae7d9ce369b 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -86,15 +86,6 @@ extern struct page *mem_map; # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -static inline bool pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index aab6e64d6db4..52b0d7e76446 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -80,8 +80,6 @@ typedef struct page *pgtable_t; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (pfn_valid(virt_to_pfn(kaddr))) #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 6faaaa3ebe9b..667e703c0e8f 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -155,10 +155,6 @@ extern int npmem_ranges; #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_SPARSEMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PMD_SHIFT /* fixed for transparent huge pages */ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index edf1dd1b0ca9..f2b6bf5687d0 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -117,15 +117,6 @@ extern long long virt_phys_offset; #ifdef CONFIG_FLATMEM #define ARCH_PFN_OFFSET ((unsigned long)(MEMORY_START >> PAGE_SHIFT)) -#ifndef __ASSEMBLY__ -extern unsigned long max_mapnr; -static inline bool pfn_valid(unsigned long pfn) -{ - unsigned long min_pfn = ARCH_PFN_OFFSET; - - return pfn >= min_pfn && pfn < max_mapnr; -} -#endif #endif #define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b5289..7fed7c431928 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -171,11 +171,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) \ - (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) -#endif - #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index eca5daa43b93..09ac6c7faee0 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -169,9 +169,6 @@ typedef struct page *pgtable_t; #define PFN_START (__MEMORY_START >> PAGE_SHIFT) #define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) >= min_low_pfn && (pfn) < max_low_pfn) -#endif #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index fff8861df107..6be6f683f98f 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -130,7 +130,6 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) #define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr) #include diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index cdbd9653aa14..84866127d074 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -108,7 +108,6 @@ extern unsigned long uml_physmem; #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) #define pfn_to_phys(pfn) PFN_PHYS(pfn) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index df42f8aa99e4..580d71aca65a 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long); #define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include static inline void clear_page(void *page) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 198e03e59ca1..cc6b8e087192 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 493eb7083b1a..a77d04972eb9 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -11,6 +11,8 @@ #ifndef _XTENSA_PAGE_H #define _XTENSA_PAGE_H +#include + #include #include #include @@ -189,8 +191,6 @@ static inline unsigned long ___pa(unsigned long va) #endif #define __va(x) \ ((void *)((unsigned long) (x) - PHYS_OFFSET + PAGE_OFFSET)) -#define pfn_valid(pfn) \ - ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a2c8ed60233a..6796abe1900e 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,6 +19,18 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +#ifndef pfn_valid +static inline int pfn_valid(unsigned long pfn) +{ + /* avoid include hell */ + extern unsigned long max_mapnr; + unsigned long pfn_offset = ARCH_PFN_OFFSET; + + return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; +} +#define pfn_valid pfn_valid +#endif + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) /* memmap is virtually contiguous. */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index 6fc47561814c..c0be2edeb484 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -84,8 +84,6 @@ extern unsigned long memory_end; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #endif -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) - #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) -- cgit v1.2.3 From 1d693a3e69ba786c6c263d51b8e6d0daf16723ae Mon Sep 17 00:00:00 2001 From: Longlong Xia Date: Tue, 31 Jan 2023 07:10:35 +0000 Subject: mm/swapfile: remove pr_debug in get_swap_pages() It's known that get_swap_pages() may fail to find available space under some extreme case, but pr_debug() provides useless information. Let's remove it. Link: https://lkml.kernel.org/r/20230131071035.1085968-1-xialonglong1@huawei.com Signed-off-by: Longlong Xia Reviewed-by: "Huang, Ying" Cc: Chen Wandun Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 888aed774fb6..cf0d72020f97 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1098,8 +1098,6 @@ start_over: spin_unlock(&si->lock); if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; - pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); cond_resched(); spin_lock(&swap_avail_lock); -- cgit v1.2.3 From 7e4a32c0e8adafdda6161635c9046e6c1e8b95b5 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 1 Feb 2023 20:51:42 +0900 Subject: mm/vmalloc: replace BUG_ON with a simple if statement As per the coding standards, in the event of an abnormal condition that should not occur under normal circumstances, the kernel should attempt recovery and proceed with execution, rather than halting the machine. Specifically, in the alloc_vmap_area() function, use a simple if() instead of using BUG_ON() halting the machine. Link: https://lkml.kernel.org/r/20230201115142.GA7772@min-iamroot Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Signed-off-by: Hyunmin Lee Reviewed-by: Christophe Leroy Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ff4d7dfdf84a..1dd7ca258a76 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1586,9 +1586,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int purged = 0; int ret; - BUG_ON(!size); - BUG_ON(offset_in_page(size)); - BUG_ON(!is_power_of_2(align)); + if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) + return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); -- cgit v1.2.3 From 601c3c29dbeb049862faa00917f2daf094a71028 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 Jan 2023 16:01:16 -0800 Subject: mm: introduce vm_flags_reset_once to replace WRITE_ONCE vm_flags updates Provide vm_flags_reset_once() and replace the vm_flags updates which used WRITE_ONCE() to prevent compiler optimizations. Link: https://lkml.kernel.org/r/20230201000116.1333160-1-surenb@google.com Fixes: 0cce31a0aa0e ("mm: replace vma->vm_flags direct modifications with modifier calls") Signed-off-by: Suren Baghdasaryan Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/mlock.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 27b34f7730e7..0ed0cb2401f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -642,6 +642,13 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { diff --git a/mm/mlock.c b/mm/mlock.c index ed49459e343e..617469fce96d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); } } -- cgit v1.2.3 From aa02d3c174abfc506058d3e43f471bc4280563d0 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 3 Feb 2023 18:01:32 +0800 Subject: mm/page_alloc: reduce fallbacks to (MIGRATE_PCPTYPES - 1) The commit 1dd214b8f21c ("mm: page_alloc: avoid merging non-fallbackable pageblocks with others") has removed MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list. so there is no need to add an element at the end of every type. Reduce fallbacks to (MIGRATE_PCPTYPES - 1). Link: https://lkml.kernel.org/r/20230203100132.1627787-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Mel Gorman Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ebce58026f1..21d820c42900 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2603,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, }; #ifdef CONFIG_CMA @@ -2865,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; *can_steal = false; - for (i = 0;; i++) { + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_TYPES) - break; - if (free_area_empty(area, fallback_mt)) continue; -- cgit v1.2.3 From 3e629597b8477efbcc0ad14ee80558a080eebdc3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 16:25:19 +0000 Subject: filemap: add mapping_read_folio_gfp() This is like read_cache_page_gfp() except it returns the folio instead of the precise page. Link: https://lkml.kernel.org/r/20230206162520.4029022-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Mark Hemment Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9f1081683771..6a32ac170d3d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -756,6 +756,8 @@ static inline struct page *grab_cache_page(struct address_space *mapping, struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); +struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, + gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 992554c18f1f..2ebcf500871d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3585,6 +3585,30 @@ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, } EXPORT_SYMBOL(read_cache_folio); +/** + * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. + * @mapping: The address_space for the folio. + * @index: The index that the allocated folio will contain. + * @gfp: The page allocator flags to use if allocating. + * + * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with + * any new memory allocations done using the specified allocation flags. + * + * The most likely error from this function is EIO, but ENOMEM is + * possible and so is EINTR. If ->read_folio returns another error, + * that will be returned to the caller. + * + * The function expects mapping->invalidate_lock to be already held. + * + * Return: Uptodate folio on success, ERR_PTR() on failure. + */ +struct folio *mapping_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return do_read_cache_folio(mapping, index, NULL, NULL, gfp); +} +EXPORT_SYMBOL(mapping_read_folio_gfp); + static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { -- cgit v1.2.3 From f01b2b3ed8735dacd92f1da548708449525e286a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 16:25:20 +0000 Subject: shmem: add shmem_read_folio() and shmem_read_folio_gfp() These are the folio replacements for shmem_read_mapping_page() and shmem_read_mapping_page_gfp(). [akpm@linux-foundation.org: fix shmem_read_mapping_page_gfp(), per Matthew] Link: https://lkml.kernel.org/r/Y+QdJTuzxeBYejw2@casper.infradead.org Link: https://lkml.kernel.org/r/20230206162520.4029022-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mark Hemment Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 8 ++++++++ mm/shmem.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d09d54be4ffd..103d1000a5a2 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -109,6 +109,14 @@ enum sgp_type { int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp); + +static inline struct folio *shmem_read_folio(struct address_space *mapping, + pgoff_t index) +{ + return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); +} static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index 732969afabd1..be6bdd320d5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4311,9 +4311,9 @@ int shmem_zero_setup(struct vm_area_struct *vma) } /** - * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. - * @mapping: the page's address_space - * @index: the page index + * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. + * @mapping: the folio's address_space + * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", @@ -4325,13 +4325,12 @@ int shmem_zero_setup(struct vm_area_struct *vma) * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ -struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, - pgoff_t index, gfp_t gfp) +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; - struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); @@ -4341,6 +4340,25 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, return ERR_PTR(error); folio_unlock(folio); + return folio; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return mapping_read_folio_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); + +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); + struct page *page; + + if (IS_ERR(folio)) + return &folio->page; + page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); @@ -4348,11 +4366,5 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, } return page; -#else - /* - * The tiny !SHMEM case uses ramfs without swap - */ - return read_cache_page_gfp(mapping, index, gfp); -#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); -- cgit v1.2.3 From 5ff2121a3336a63aa7060cd71534d39dfb1cd2d1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 19:08:50 +0000 Subject: shmem: fix W=1 build warnings with CONFIG_SHMEM=n With W=1 and CONFIG_SHMEM=n, shmem.c functions have no prototypes so the compiler emits warnings. Link: https://lkml.kernel.org/r/20230206190850.4054983-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mark Hemment Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index be6bdd320d5f..577b3838c6b9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include #include -- cgit v1.2.3 From d76f99548cf0474c3bf75f25fab1778e03ade5f2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:14 +0800 Subject: mm/vmalloc.c: add used_map into vmap_block to track space of vmap_block Patch series "mm/vmalloc.c: allow vread() to read out vm_map_ram areas", v5. Problem: *** Stephen reported vread() will skip vm_map_ram areas when reading out /proc/kcore with drgn utility. Please see below link to get more details. /proc/kcore reads 0's for vmap_block https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u Root cause: *** The normal vmalloc API uses struct vmap_area to manage the virtual kernel area allocated, and associate a vm_struct to store more information and pass out. However, area reserved through vm_map_ram() interface doesn't allocate vm_struct to associate with. So the current code in vread() will skip the vm_map_ram area through 'if (!va->vm)' conditional checking. Solution: *** To mark the area reserved through vm_map_ram() interface, add field 'flags' into struct vmap_area. Bit 0 indicates this is vm_map_ram area created through vm_map_ram() interface, bit 1 marks out the type of vm_map_ram area which makes use of vmap_block to manage split regions via vb_alloc/free(). And also add bitmap field 'used_map' into struct vmap_block to mark those further subdivided regions being used to differentiate with dirty and free regions in vmap_block. With the help of above vmap_area->flags and vmap_block->used_map, we can recognize and handle vm_map_ram areas successfully. All these are done in patch 1~3. Meanwhile, do some improvement on areas related to vm_map_ram areas in patch 4, 5. And also change area flag from VM_ALLOC to VM_IOREMAP in patch 6, 7 because this will show them as 'ioremap' in /proc/vmallocinfo, and exclude them from /proc/kcore. This patch (of 7): In one vmap_block area, there could be three types of regions: region being used which is allocated through vb_alloc(), dirty region which is freed via vb_free() and free region. Among them, only used region has available data. While there's no way to track those used regions currently. Here, add bitmap field used_map into vmap_block, and set/clear it during allocation or freeing regions of vmap_block area. This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20230206084020.174506-2-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1dd7ca258a76..0f1396d6fbe6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1910,6 +1910,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; + DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; @@ -1986,10 +1987,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; + bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -2099,6 +2102,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; + bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -2132,6 +2136,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); -- cgit v1.2.3 From 869176a096068056b338b5cc1b0af93106007f5d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:15 +0800 Subject: mm/vmalloc.c: add flags to mark vm_map_ram area Through vmalloc API, a virtual kernel area is reserved for physical address mapping. And vmap_area is used to track them, while vm_struct is allocated to associate with the vmap_area to store more information and passed out. However, area reserved via vm_map_ram() is an exception. It doesn't have vm_struct to associate with vmap_area. And we can't recognize the vmap_area with '->vm == NULL' as a vm_map_ram() area because the normal freeing path will set va->vm = NULL before unmapping, please see function remove_vm_area(). Meanwhile, there are two kinds of handling for vm_map_ram area. One is the whole vmap_area being reserved and mapped at one time through vm_map_area() interface; the other is the whole vmap_area with VMAP_BLOCK_SIZE size being reserved, while mapped into split regions with smaller size via vb_alloc(). To mark the area reserved through vm_map_ram(), add flags field into struct vmap_area. Bit 0 indicates this is vm_map_ram area created through vm_map_ram() interface, while bit 1 marks out the type of vm_map_ram area which makes use of vmap_block to manage split regions via vb_alloc/free(). This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 096d48aa3437..69250efa03d1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,6 +76,7 @@ struct vmap_area { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; + unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f1396d6fbe6..2d0960190c42 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1578,7 +1578,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, - int node, gfp_t gfp_mask) + int node, gfp_t gfp_mask, + unsigned long va_flags) { struct vmap_area *va; unsigned long freed; @@ -1623,6 +1624,7 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; + va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); @@ -1901,6 +1903,10 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ +#define VMAP_FLAGS_MASK 0x3 + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1976,7 +1982,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, - node, gfp_mask); + node, gfp_mask, + VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -2285,7 +2292,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, - VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + VMALLOC_START, VMALLOC_END, + node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; @@ -2483,7 +2491,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; -- cgit v1.2.3 From 06c8994626d1b7d8c26dfd06992d67703a274054 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:16 +0800 Subject: mm/vmalloc.c: allow vread() to read out vm_map_ram areas Currently, vread can read out vmalloc areas which is associated with a vm_struct. While this doesn't work for areas created by vm_map_ram() interface because it doesn't have an associated vm_struct. Then in vread(), these areas are all skipped. Here, add a new function vmap_ram_vread() to read out vm_map_ram areas. The area created with vmap_ram_vread() interface directly can be handled like the other normal vmap areas with aligned_vread(). While areas which will be further subdivided and managed with vmap_block need carefully read out page-aligned small regions and zero fill holes. Link: https://lkml.kernel.org/r/20230206084020.174506-4-bhe@redhat.com Reported-by: Stephen Brennan Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Tested-by: Stephen Brennan Cc: Dan Carpenter Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2d0960190c42..7188a47315c2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3463,6 +3463,68 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + /* + * If it's area created by vm_map_ram() interface directly, but + * not further subdividing and delegating management to vmap_block, + * handle it here. + */ + if (!(flags & VMAP_BLOCK)) { + aligned_vread(buf, addr, count); + return; + } + + /* + * Area is split into regions and tracked with vmap_block, read out + * each region and zero fill the hole between regions. + */ + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + if (!vb) + goto finished; + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + goto finished; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + while (addr < start) { + if (count == 0) + goto unlock; + *buf = '\0'; + buf++; + addr++; + count--; + } + /*it could start reading from the middle of used region*/ + offset = offset_in_page(addr); + n = ((re - rs + 1) << PAGE_SHIFT) - offset; + if (n > count) + n = count; + aligned_vread(buf, start+offset, n); + + buf += n; + addr += n; + count -= n; + } +unlock: + spin_unlock(&vb->lock); + +finished: + /* zero-fill the left dirty or free regions */ + if (count) + memset(buf, 0, count); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3493,7 +3555,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3514,12 +3576,21 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + vm = va->vm; + flags = va->flags & VMAP_FLAGS_MASK; + /* + * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need + * be set together with VMAP_RAM. + */ + WARN_ON(flags == VMAP_BLOCK); + + if (!vm && !flags) continue; - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + vaddr = (char *) va->va_start; + size = vm ? get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3529,10 +3600,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if (flags & VMAP_RAM) + vmap_ram_vread(buf, addr, n, flags); + else if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); -- cgit v1.2.3 From bba9697b42ead45687352fdd0fd498735bc4361d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:17 +0800 Subject: mm/vmalloc: explicitly identify vm_map_ram area when shown in /proc/vmcoreinfo Now, by marking VMAP_RAM in vmap_area->flags for vm_map_ram area, we can clearly differentiate it with other vmalloc areas. So identify vm_map_area area by checking VMAP_RAM of vmap_area->flags when shown in /proc/vmcoreinfo. Meanwhile, the code comment above vm_map_ram area checking in s_show() is not needed any more, remove it here. Link: https://lkml.kernel.org/r/20230206084020.174506-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7188a47315c2..87d71c783646 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4152,14 +4152,11 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); - /* - * s_show can encounter race with remove_vm_area, !vm on behalf - * of vmap area is being tear down or vm_map_ram allocation. - */ if (!va->vm) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); goto final; } -- cgit v1.2.3 From 30a7a9b17c4b0331ec67aadb4b30ff2a951b4ed5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:18 +0800 Subject: mm/vmalloc: skip the uninitilized vmalloc areas For areas allocated via vmalloc_xxx() APIs, it searches for unmapped area to reserve and allocates new pages to map into, please see function __vmalloc_node_range(). During the process, flag VM_UNINITIALIZED is set in vm->flags to indicate that the pages allocation and mapping haven't been done, until clear_vm_uninitialized_flag() is called to clear VM_UNINITIALIZED. For this kind of area, if VM_UNINITIALIZED is still set, let's ignore it in vread() because pages newly allocated and being mapped in that area only contains zero data. reading them out by aligned_vread() is wasting time. Link: https://lkml.kernel.org/r/20230206084020.174506-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 87d71c783646..3b57260b6d39 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3587,6 +3587,11 @@ long vread(char *buf, char *addr, unsigned long count) if (!vm && !flags) continue; + if (vm && (vm->flags & VM_UNINITIALIZED)) + continue; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); -- cgit v1.2.3 From 728e5ae07dd6cf72676e67e376671e52b7ddfac3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:19 +0800 Subject: powerpc: mm: add VM_IOREMAP flag to the vmalloc area Currently, for vmalloc areas with flag VM_IOREMAP set, except of the specific alignment clamping in __get_vm_area_node(), they will be 1) Shown as ioremap in /proc/vmallocinfo; 2) Ignored by /proc/kcore reading via vread() So for the io mapping in ioremap_phb() of ppc, we should set VM_IOREMAP in flag to make it handled correctly as above. Link: https://lkml.kernel.org/r/20230206084020.174506-7-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- arch/powerpc/kernel/pci_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 0c7cfb9fab04..fd42059ae2a5 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -132,7 +132,7 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) * address decoding but I'd rather not deal with those outside of the * reserved 64K legacy region. */ - area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + area = __get_vm_area_caller(size, VM_IOREMAP, PHB_IO_BASE, PHB_IO_END, __builtin_return_address(0)); if (!area) return NULL; -- cgit v1.2.3 From 61a1a9906f66bd0eaaf9bade96f22a60f04240b7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:20 +0800 Subject: sh: mm: set VM_IOREMAP flag to the vmalloc area Currently, for vmalloc areas with flag VM_IOREMAP set, except of the specific alignment clamping in __get_vm_area_node(), they will be 1) Shown as ioremap in /proc/vmallocinfo; 2) Ignored by /proc/kcore reading via vread() So for the ioremap in __sq_remap() of sh, we should set VM_IOREMAP in flag to make it handled correctly as above. Link: https://lkml.kernel.org/r/20230206084020.174506-8-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- arch/sh/kernel/cpu/sh4/sq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index a76b94e41e91..27f2e3da5aa2 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,7 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + vma = __get_vm_area_caller(map->size, VM_IOREMAP, map->sq_addr, SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; -- cgit v1.2.3 From b2a72dff85fa27fd80349a4f7fae8a6e9bcbbd15 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:22 -0400 Subject: mm/gup: have internal functions get the mmap_read_lock() Patch series "Simplify the external interface for GUP", v2. It is quite a maze of EXPORTED symbols leading up to the three actual worker functions of GUP. Simplify this by reorganizing some of the code so the EXPORTED symbols directly call the correct internal function with validated and consistent arguments. Consolidate all the assertions into one place at the top of the call chains. Remove some dead code. Move more things into the mm/internal.h header This patch (of 13): __get_user_pages_locked() and __gup_longterm_locked() both require the mmap lock to be held. They have a slightly unusual locked parameter that is used to allow these functions to unlock and relock the mmap lock and convey that fact to the caller. Several places wrap these functions with a simple mmap_read_lock() just so they can follow the optimized locked protocol. Consolidate this internally to the functions. Allow internal callers to set locked = 0 to cause the functions to acquire and release the lock on their own. Reorganize __gup_longterm_locked() to use the autolocking in __get_user_pages_locked(). Replace all the places obtaining the mmap_read_lock() just to call __get_user_pages_locked() with the new mechanism. Replace all the internal callers of get_user_pages_unlocked() with direct calls to __gup_longterm_locked() using the new mechanism. A following patch will add assertions ensuring the external interface continues to always pass in locked = 1. Link: https://lkml.kernel.org/r/0-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Link: https://lkml.kernel.org/r/1-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Claudio Imbrenda Signed-off-by: Andrew Morton --- mm/gup.c | 113 ++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index c4b793385ed2..5b03d2fd3e1c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1331,8 +1331,17 @@ static bool gup_signal_pending(unsigned int flags) } /* - * Please note that this function, unlike __get_user_pages will not - * return 0 for nr_pages > 0 without FOLL_NOWAIT + * Locking: (*locked == 1) means that the mmap_lock has already been acquired by + * the caller. This function may drop the mmap_lock. If it does so, then it will + * set (*locked = 0). + * + * (*locked == 0) means that the caller expects this function to acquire and + * drop the mmap_lock. Therefore, the value of *locked will still be zero when + * the function returns, even though it may have changed temporarily during + * function execution. + * + * Please note that this function, unlike __get_user_pages(), will not return 0 + * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, @@ -1343,13 +1352,22 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned int flags) { long ret, pages_done; - bool lock_dropped; + bool must_unlock = false; if (locked) { /* if VM_FAULT_RETRY can be returned, vmas become invalid */ BUG_ON(vmas); - /* check caller initialized locked */ - BUG_ON(*locked != 1); + } + + /* + * The internal caller expects GUP to manage the lock internally and the + * lock must be released when this returns. + */ + if (locked && !*locked) { + if (mmap_read_lock_killable(mm)) + return -EAGAIN; + must_unlock = true; + *locked = 1; } if (flags & FOLL_PIN) @@ -1368,7 +1386,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, flags |= FOLL_GET; pages_done = 0; - lock_dropped = false; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); @@ -1404,7 +1421,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; - lock_dropped = true; + + /* The lock was temporarily dropped, so we must unlock later */ + must_unlock = true; retry: /* @@ -1451,10 +1470,11 @@ retry: pages++; start += PAGE_SIZE; } - if (lock_dropped && *locked) { + if (must_unlock && *locked) { /* - * We must let the caller know we temporarily dropped the lock - * and so the critical section protected by it was lost. + * We either temporarily dropped the lock, or the caller + * requested that we both acquire and drop the lock. Either way, + * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; @@ -1659,9 +1679,24 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned int foll_flags) { struct vm_area_struct *vma; + bool must_unlock = false; unsigned long vm_flags; long i; + if (!nr_pages) + return 0; + + /* + * The internal caller expects GUP to manage the lock internally and the + * lock must be released when this returns. + */ + if (locked && !*locked) { + if (mmap_read_lock_killable(mm)) + return -EAGAIN; + must_unlock = true; + *locked = 1; + } + /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ @@ -1673,12 +1708,12 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) - goto finish_or_fault; + break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) - goto finish_or_fault; + break; if (pages) { pages[i] = virt_to_page((void *)start); @@ -1690,9 +1725,11 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, start = (start + PAGE_SIZE) & PAGE_MASK; } - return i; + if (must_unlock && *locked) { + mmap_read_unlock(mm); + *locked = 0; + } -finish_or_fault: return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ @@ -1861,17 +1898,13 @@ EXPORT_SYMBOL(fault_in_readable); #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { - struct mm_struct *mm = current->mm; struct page *page; - int locked = 1; + int locked = 0; int ret; - if (mmap_read_lock_killable(mm)) - return NULL; - ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, + &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); - if (locked) - mmap_read_unlock(mm); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ @@ -2047,13 +2080,9 @@ static long __gup_longterm_locked(struct mm_struct *mm, int *locked, unsigned int gup_flags) { - bool must_unlock = false; unsigned int flags; long rc, nr_pinned_pages; - if (locked && WARN_ON_ONCE(!*locked)) - return -EINVAL; - if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); @@ -2070,11 +2099,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, return -EINVAL; flags = memalloc_pin_save(); do { - if (locked && !*locked) { - mmap_read_lock(mm); - must_unlock = true; - *locked = 1; - } nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); @@ -2085,11 +2109,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); - - if (locked && *locked && must_unlock) { - mmap_read_unlock(mm); - *locked = 0; - } return rc ? rc : nr_pinned_pages; } @@ -2242,16 +2261,10 @@ EXPORT_SYMBOL(get_user_pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - struct mm_struct *mm = current->mm; - int locked = 1; - long ret; + int locked = 0; - mmap_read_lock(mm); - ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, - gup_flags | FOLL_TOUCH); - if (locked) - mmap_read_unlock(mm); - return ret; + return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2904,6 +2917,7 @@ static int internal_get_user_pages_fast(unsigned long start, { unsigned long len, end; unsigned long nr_pinned; + int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | @@ -2932,8 +2946,9 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, - gup_flags); + ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, + pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3183,11 +3198,13 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + int locked = 0; if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); + gup_flags |= FOLL_PIN | FOLL_TOUCH; + return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); -- cgit v1.2.3 From 7427c30bea1449a885a1dd9baf991aaad26209ce Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:23 -0400 Subject: mm/gup: remove obsolete FOLL_LONGTERM comment These days FOLL_LONGTERM is not allowed at all on any get_user_pages*() functions, it must be only be used with pin_user_pages*(), plus it now has universal support for all the pin_user_pages*() functions. Link: https://lkml.kernel.org/r/2-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10a1e41f4e70..4396c7bf06d1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1053,12 +1053,6 @@ typedef unsigned int __bitwise zap_flags_t; * specifically failed. Filesystem pages are still subject to bugs and use of * FOLL_LONGTERM should be avoided on those pages. * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * * In the CMA case: long term pins in a CMA region would unnecessarily fragment * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. -- cgit v1.2.3 From afa3c33e2684c2eec4f47d83d2859b76f3568be6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:24 -0400 Subject: mm/gup: don't call __gup_longterm_locked() if FOLL_LONGTERM cannot be set get_user_pages_remote(), get_user_pages_unlocked() and get_user_pages() are never called with FOLL_LONGTERM, so directly call __get_user_pages_locked() The next patch will add an assertion for this. Link: https://lkml.kernel.org/r/3-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Suggested-by: John Hubbard Reviewed-by: John Hubbard Acked-by: Mike Rapoport (IBM) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 5b03d2fd3e1c..9bd4d775716a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2200,8 +2200,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2238,8 +2238,8 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, NULL, gup_flags | FOLL_TOUCH); + return __get_user_pages_locked(current->mm, start, nr_pages, pages, + vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2263,8 +2263,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + return __get_user_pages_locked(current->mm, start, nr_pages, pages, + NULL, &locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); -- cgit v1.2.3 From 7ce154fe6917e7db94d63bc4d6c73b678ad1c581 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:25 -0400 Subject: mm/gup: move try_grab_page() to mm/internal.h This is part of the internal function of gup.c and is only non-static so that the parts of gup.c in the huge_memory.c and hugetlb.c can call it. Put it in internal.h beside the similarly purposed try_grab_folio() Link: https://lkml.kernel.org/r/4-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/internal.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ed0cb2401f5..afefc166b349 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1268,8 +1268,6 @@ static inline void get_page(struct page *page) folio_get(page_folio(page)); } -int __must_check try_grab_page(struct page *page, unsigned int flags); - static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); diff --git a/mm/internal.h b/mm/internal.h index 90bb2078444c..3f75763b0fc2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -856,6 +856,7 @@ int migrate_device_coherent_page(struct page *page); * mm/gup.c */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); extern bool mirrored_kernelcore; -- cgit v1.2.3 From d64e2dbc33a109a37ad4f5c18945c324345fe873 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:26 -0400 Subject: mm/gup: simplify the external interface functions and consolidate invariants The GUP family of functions have a complex, but fairly well defined, set of invariants for their arguments. Currently these are sprinkled about, sometimes in duplicate through many functions. Internally we don't follow all the invariants that the external interface has to follow, so place these checks directly at the exported interface. This ensures the internal functions never reach a violated invariant. Remove the duplicated invariant checks. The end result is to make these functions fully internal: __get_user_pages_locked() internal_get_user_pages_fast() __gup_longterm_locked() And all the other functions call directly into one of these. Link: https://lkml.kernel.org/r/5-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Suggested-by: John Hubbard Reviewed-by: John Hubbard Acked-by: Mike Rapoport (IBM) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 153 +++++++++++++++++++++++++++---------------------------- mm/huge_memory.c | 10 ---- 2 files changed, 75 insertions(+), 88 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9bd4d775716a..029de13c6718 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -215,7 +215,6 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; @@ -818,7 +817,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if (vma_is_secretmem(vma)) return NULL; - if (foll_flags & FOLL_PIN) + if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; page = follow_page_mask(vma, address, foll_flags, &ctx); @@ -975,9 +974,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; - if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA)) - return -EOPNOTSUPP; - if (vma_is_secretmem(vma)) return -EFAULT; @@ -1354,11 +1350,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, long ret, pages_done; bool must_unlock = false; - if (locked) { - /* if VM_FAULT_RETRY can be returned, vmas become invalid */ - BUG_ON(vmas); - } - /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. @@ -2087,16 +2078,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); - /* - * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM - * implies FOLL_PIN (although the reverse is not true). Therefore it is - * correct to unconditionally call check_and_migrate_movable_pages() - * which assumes pages have been pinned via FOLL_PIN. - * - * Enforce the above reasoning by asserting that FOLL_PIN is set. - */ - if (WARN_ON(!(gup_flags & FOLL_PIN))) - return -EINVAL; flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, @@ -2106,28 +2087,66 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = nr_pinned_pages; break; } + + /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } -static bool is_valid_gup_flags(unsigned int gup_flags) +/* + * Check that the given flags are valid for the exported gup/pup interface, and + * update them with the required flags that the caller must have set. + */ +static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int *gup_flags_p, + unsigned int to_set) { + unsigned int gup_flags = *gup_flags_p; + /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: + * These flags not allowed to be specified externally to the gup + * interfaces: + * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only + * - FOLL_REMOTE is internal only and used on follow_page() */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | + FOLL_REMOTE | FOLL_FAST_ONLY))) + return false; + + gup_flags |= to_set; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return false; + + /* LONGTERM can only be specified when pinning */ + if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) + return false; + + /* Pages input must be given if using GET/PIN */ + if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; + + /* At the external interface locked must be set */ + if (WARN_ON_ONCE(locked && *locked != 1)) + return false; + + /* We want to allow the pgmap to be hot-unplugged at all times */ + if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && + (gup_flags & FOLL_PCI_P2PDMA))) + return false; + /* - * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying - * that is, FOLL_LONGTERM is a specific case, more restrictive case of - * FOLL_PIN. + * Can't use VMAs with locked, as locked allows GUP to unlock + * which invalidates the vmas array */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + if (WARN_ON_ONCE(vmas && locked)) return false; + *gup_flags_p = gup_flags; return true; } @@ -2197,11 +2216,12 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - if (!is_valid_gup_flags(gup_flags)) + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2235,11 +2255,11 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - if (!is_valid_gup_flags(gup_flags)) + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, NULL, gup_flags | FOLL_TOUCH); + vmas, NULL, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -2263,8 +2283,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) + return -EINVAL; + return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags | FOLL_TOUCH); + NULL, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2992,7 +3015,9 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - gup_flags |= FOLL_GET | FOLL_FAST_ONLY; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_GET | FOLL_FAST_ONLY)) + return -EINVAL; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -3029,16 +3054,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - if (!is_valid_gup_flags(gup_flags)) - return -EINVAL; - /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ - gup_flags |= FOLL_GET; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) + return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -3062,14 +3085,8 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - - gup_flags |= FOLL_PIN; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); @@ -3085,20 +3102,14 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages, { int nr_pinned; - /* - * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API - * rules require returning 0, rather than -errno: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return 0; - - if (WARN_ON_ONCE(!pages)) - return 0; /* * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_PIN | FOLL_FAST_ONLY)) + return 0; + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); /* @@ -3140,16 +3151,11 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) + return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_PIN | FOLL_TOUCH | - FOLL_REMOTE); + gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3174,14 +3180,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - - gup_flags |= FOLL_PIN; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, NULL, gup_flags); } @@ -3195,15 +3195,12 @@ EXPORT_SYMBOL(pin_user_pages); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; int locked = 0; - if (WARN_ON_ONCE(!pages)) - return -EINVAL; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_PIN | FOLL_TOUCH)) + return 0; - gup_flags |= FOLL_PIN | FOLL_TOUCH; return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, &locked, gup_flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d6977dc6b31..1343a7d88299 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1042,11 +1042,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return NULL; - if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -1205,11 +1200,6 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return NULL; - if (pud_present(*pud) && pud_devmap(*pud)) /* pass */; else -- cgit v1.2.3 From 961ba47242510ac7af7c66733e471b9d3a6ade1a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:27 -0400 Subject: mm/gup: add an assertion that the mmap lock is locked Since commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") we already have this assertion, it is just buried in find_vma(): __get_user_pages_locked() __get_user_pages() find_extend_vma() find_vma() Also check it at the top of __get_user_pages_locked() as a form of documentation. Link: https://lkml.kernel.org/r/6-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/gup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 029de13c6718..bc3f56b7621e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1360,6 +1360,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, must_unlock = true; *locked = 1; } + else + mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); -- cgit v1.2.3 From 6e4382c706f7701c4e4e430ebc9f817eeda64857 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:28 -0400 Subject: mm/gup: remove locked being NULL from faultin_vma_page_range() The only caller of this function always passes in a non-NULL locked, so just remove this obsolete comment. Link: https://lkml.kernel.org/r/7-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/gup.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index bc3f56b7621e..3afcd042f426 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1558,12 +1558,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * code on error (see __get_user_pages()). * * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. - * - * If @locked is NULL, it may be held for read or write and will be unperturbed. - * - * If @locked is non-NULL, it must held for read only and may be released. If - * it's released, *@locked will be set to 0. + * covered by the VMA. If it's released, *@locked will be set to 0. */ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked) -- cgit v1.2.3 From f04740f54594f85935e29a5c8ff6722f427f3dac Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:29 -0400 Subject: mm/gup: add FOLL_UNLOCKABLE Setting FOLL_UNLOCKABLE allows GUP to lock/unlock the mmap lock on its own. It is a more explicit replacement for locked != NULL. This clears the way for passing in locked = 1, without intending that the lock can be unlocked. Set the flag in all cases where it is used, eg locked is present in the external interface or locked is used internally with locked = 0. Link: https://lkml.kernel.org/r/8-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 1 + mm/gup.c | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4396c7bf06d1..434b3ac8a351 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1104,5 +1104,6 @@ typedef unsigned int __bitwise zap_flags_t; #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ #define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ #define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ +#define FOLL_UNLOCKABLE 0x400000 /* allow unlocking the mmap lock (internal only) */ #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/gup.c b/mm/gup.c index 3afcd042f426..310fc6ab967e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -896,7 +896,7 @@ static int faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) { + if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set @@ -1382,9 +1382,11 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); - if (!locked) + if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ - return ret; + pages_done = ret; + break; + } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { @@ -1532,6 +1534,9 @@ long populate_vma_page_range(struct vm_area_struct *vma, if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; + if (locked) + gup_flags |= FOLL_UNLOCKABLE; + /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. @@ -1583,7 +1588,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; if (write) gup_flags |= FOLL_WRITE; @@ -2107,12 +2112,20 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * interfaces: * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ - if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | + if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | FOLL_REMOTE | FOLL_FAST_ONLY))) return false; gup_flags |= to_set; + if (locked) { + /* At the external interface locked must be set */ + if (WARN_ON_ONCE(*locked != 1)) + return false; + + gup_flags |= FOLL_UNLOCKABLE; + } /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == @@ -2127,10 +2140,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; - /* At the external interface locked must be set */ - if (WARN_ON_ONCE(locked && *locked != 1)) - return false; - /* We want to allow the pgmap to be hot-unplugged at all times */ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))) @@ -2140,7 +2149,7 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * Can't use VMAs with locked, as locked allows GUP to unlock * which invalidates the vmas array */ - if (WARN_ON_ONCE(vmas && locked)) + if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) return false; *gup_flags_p = gup_flags; @@ -2280,7 +2289,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, @@ -2968,7 +2978,7 @@ static int internal_get_user_pages_fast(unsigned long start, pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, NULL, &locked, - gup_flags | FOLL_TOUCH); + gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3195,7 +3205,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 0; if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, - FOLL_PIN | FOLL_TOUCH)) + FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, -- cgit v1.2.3 From 9a863a6a51394bff480c959b713874c090a8f5c6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:30 -0400 Subject: mm/gup: make locked never NULL in the internal GUP functions Now that NULL locked doesn't have a special meaning we can just make it non-NULL in all cases and remove the special tests. get_user_pages() and pin_user_pages() can safely pass in a locked = 1 get_user_pages_remote) and pin_user_pages_remote() can swap in a local variable for locked if NULL is passed. Remove all the NULL checks. Link: https://lkml.kernel.org/r/9-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 310fc6ab967e..4ce88e00e1c6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -879,9 +879,9 @@ unmap: } /* - * mmap_lock must be held on entry. If @locked != NULL and *@flags - * does not include FOLL_NOWAIT, the mmap_lock may be released. If it - * is, *@locked will be set to 0 and -EBUSY returned. + * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not + * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set + * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, bool unshare, @@ -930,8 +930,8 @@ static int faultin_page(struct vm_area_struct *vma, * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); - if (locked) - *locked = 0; + *locked = 0; + /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of @@ -951,7 +951,7 @@ static int faultin_page(struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } @@ -1062,14 +1062,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @locked != NULL, *@locked will be set to 0 when mmap_lock is - * released by an up_read(). That can happen if @gup_flags does not - * have FOLL_NOWAIT. + * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may + * be released. If this happens *@locked will be set to 0 on return. * - * A caller using such a combination of @locked and @gup_flags - * must therefore hold the mmap_lock for reading only, and recognize - * when it's been released. Otherwise, it must be held for either - * reading or writing and will not be released. + * A caller using such a combination of @gup_flags must therefore hold the + * mmap_lock for reading only, and recognize when it's been released. Otherwise, + * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -1121,7 +1119,7 @@ static long __get_user_pages(struct mm_struct *mm, i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags, locked); - if (locked && *locked == 0) { + if (!*locked) { /* * We've got a VM_FAULT_RETRY * and we've lost mmap_lock. @@ -1354,7 +1352,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ - if (locked && !*locked) { + if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; @@ -1502,6 +1500,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; + int local_locked = 1; int gup_flags; long ret; @@ -1542,7 +1541,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked); + NULL, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } @@ -1683,7 +1682,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ - if (locked && !*locked) { + if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; @@ -2222,11 +2221,14 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + int local_locked = 1; + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2261,11 +2263,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + int locked = 1; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, NULL, gup_flags); + vmas, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -3158,10 +3162,13 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + int local_locked = 1; + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3187,10 +3194,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + int locked = 1; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, NULL, gup_flags); + pages, vmas, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); -- cgit v1.2.3 From edad1bb1fbf7e28b49bf76b2aa66bfcaba00f627 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:31 -0400 Subject: mm/gup: remove pin_user_pages_fast_only() Commit ed29c2691188 ("drm/i915: Fix userptr so we do not have to worry about obj->mm.lock, v7.") removed the only caller, remove this dead code too. Link: https://lkml.kernel.org/r/10-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/gup.c | 33 --------------------------------- 2 files changed, 35 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index afefc166b349..a0d59645dcf9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2261,8 +2261,6 @@ extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) diff --git a/mm/gup.c b/mm/gup.c index 4ce88e00e1c6..91c8ab53f43f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3102,39 +3102,6 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); -/* - * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior - * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. - * - * The API rules are the same, too: no negative values may be returned. - */ -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int nr_pinned; - - /* - * FOLL_FAST_ONLY is required in order to match the API description of - * this routine: no fall back to regular ("slow") GUP. - */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, - FOLL_PIN | FOLL_FAST_ONLY)) - return 0; - - nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, - pages); - /* - * This routine is not allowed to return negative values. However, - * internal_get_user_pages_fast() *can* return -errno. Therefore, - * correct for that here: - */ - if (nr_pinned < 0) - nr_pinned = 0; - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); - /** * pin_user_pages_remote() - pin pages of a remote process * -- cgit v1.2.3 From 9198a9196ee67814a101c178ed828f8ea9c2965e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:32 -0400 Subject: mm/gup: make get_user_pages_fast_only() return the common return value There are only two callers, both can handle the common return code: - get_user_page_fast_only() checks == 1 - gfn_to_page_many_atomic() already returns -1, and the only caller checks for negative return values Remove the restriction against returning negative values. Link: https://lkml.kernel.org/r/11-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 91c8ab53f43f..91d047096c09 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3005,8 +3005,6 @@ static int internal_get_user_pages_fast(unsigned long start, * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. * * If the architecture does not support this function, simply return with no * pages pinned. @@ -3018,7 +3016,6 @@ static int internal_get_user_pages_fast(unsigned long start, int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - int nr_pinned; /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. @@ -3030,19 +3027,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; - nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, - pages); - - /* - * As specified in the API description above, this routine is not - * allowed to return negative values. However, the common core - * routine internal_get_user_pages_fast() *can* return -errno. - * Therefore, correct for that here: - */ - if (nr_pinned < 0) - nr_pinned = 0; - - return nr_pinned; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); -- cgit v1.2.3 From 63b605128655f2e3968d99e30b293c7e7eaa2fc2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:33 -0400 Subject: mm/gup: move gup_must_unshare() to mm/internal.h This function is only used in gup.c and closely related. It touches FOLL_PIN so it must be moved before the next patch. Link: https://lkml.kernel.org/r/12-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 65 ------------------------------------------------------ mm/internal.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a0d59645dcf9..4a0695ef969a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3180,71 +3180,6 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -/* - * Indicates for which pages that are write-protected in the page table, - * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the - * GUP pin will remain consistent with the pages mapped into the page tables - * of the MM. - * - * Temporary unmapping of PageAnonExclusive() pages or clearing of - * PageAnonExclusive() has to protect against concurrent GUP: - * * Ordinary GUP: Using the PT lock - * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): see - * page_try_share_anon_rmap() - * - * Must be called with the (sub)page that's actually referenced via the - * page table entry, which might not necessarily be the head page for a - * PTE-mapped THP. - * - * If the vma is NULL, we're coming from the GUP-fast path and might have - * to fallback to the slow path just to lookup the vma. - */ -static inline bool gup_must_unshare(struct vm_area_struct *vma, - unsigned int flags, struct page *page) -{ - /* - * FOLL_WRITE is implicitly handled correctly as the page table entry - * has to be writable -- and if it references (part of) an anonymous - * folio, that part is required to be marked exclusive. - */ - if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) - return false; - /* - * Note: PageAnon(page) is stable until the page is actually getting - * freed. - */ - if (!PageAnon(page)) { - /* - * We only care about R/O long-term pining: R/O short-term - * pinning does not have the semantics to observe successive - * changes through the process page tables. - */ - if (!(flags & FOLL_LONGTERM)) - return false; - - /* We really need the vma ... */ - if (!vma) - return true; - - /* - * ... because we only care about writable private ("COW") - * mappings where we have to break COW early. - */ - return is_cow_mapping(vma->vm_flags); - } - - /* Paired with a memory barrier in page_try_share_anon_rmap(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) - smp_rmb(); - - /* - * Note that PageKsm() pages cannot be exclusive, and consequently, - * cannot get pinned. - */ - return !PageAnonExclusive(page); -} - /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. diff --git a/mm/internal.h b/mm/internal.h index 3f75763b0fc2..4f5ca3401b05 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,71 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. + */ +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. + */ + return !PageAnonExclusive(page); +} + extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) -- cgit v1.2.3 From 2c2241081f7dec878331fdc3a3f2361e99556bca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:34 -0400 Subject: mm/gup: move private gup FOLL_ flags to internal.h Move the flags that should not/are not used outside gup.c and related into mm/internal.h to discourage driver abuse. To make this more maintainable going forward compact the two FOLL ranges with new bit numbers from 0 to 11 and 16 to 21, using shifts so it is explicit. Switch to an enum so the whole thing is easier to read. Link: https://lkml.kernel.org/r/13-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Acked-by: David Hildenbrand Cc: David Howells Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: Alistair Popple Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 57 +++++++++++++++++++++++++++++------------------- mm/internal.h | 15 +++++++++++++ 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 434b3ac8a351..56753d0f096d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1040,9 +1040,6 @@ typedef unsigned int __bitwise zap_flags_t; * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm @@ -1086,24 +1083,40 @@ typedef unsigned int __bitwise zap_flags_t; * Please see Documentation/core-api/pin_user_pages.rst for more information. */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ -#define FOLL_UNLOCKABLE 0x400000 /* allow unlocking the mmap lock (internal only) */ +enum { + /* check pte is writable */ + FOLL_WRITE = 1 << 0, + /* do get_page on page */ + FOLL_GET = 1 << 1, + /* give error on hole if it would be zero */ + FOLL_DUMP = 1 << 2, + /* get_user_pages read/write w/o permission */ + FOLL_FORCE = 1 << 3, + /* + * if a disk transfer is needed, start the IO and return without waiting + * upon it + */ + FOLL_NOWAIT = 1 << 4, + /* do not fault in pages */ + FOLL_NOFAULT = 1 << 5, + /* check page is hwpoisoned */ + FOLL_HWPOISON = 1 << 6, + /* don't do file mappings */ + FOLL_ANON = 1 << 7, + /* + * FOLL_LONGTERM indicates that the page will be held for an indefinite + * time period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + */ + FOLL_LONGTERM = 1 << 8, + /* split huge pmd before returning */ + FOLL_SPLIT_PMD = 1 << 9, + /* allow returning PCI P2PDMA pages */ + FOLL_PCI_P2PDMA = 1 << 10, + /* allow interrupts from generic signals */ + FOLL_INTERRUPTIBLE = 1 << 11, + + /* See also internal only FOLL flags in mm/internal.h */ +}; #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/internal.h b/mm/internal.h index 4f5ca3401b05..dfb37e94e140 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,21 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +enum { + /* mark page accessed */ + FOLL_TOUCH = 1 << 16, + /* a retry, previous pass started an IO */ + FOLL_TRIED = 1 << 17, + /* we are working on non-current tsk/mm */ + FOLL_REMOTE = 1 << 18, + /* pages must be released via unpin_user_page */ + FOLL_PIN = 1 << 19, + /* gup_fast: prevent fall-back to slow gup */ + FOLL_FAST_ONLY = 1 << 20, + /* allow unlocking the mmap lock */ + FOLL_UNLOCKABLE = 1 << 21, +}; + /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the -- cgit v1.2.3 From e56397e8c40da82c78dccb6f48bfa21e88ccb1e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 7 Feb 2023 19:21:15 +0000 Subject: mm/damon/sysfs: make kobj_type structures constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.") the driver core allows the usage of const struct kobj_type. Take advantage of this to constify the structure definitions to prevent modification at runtime. Link: https://lkml.kernel.org/r/20230207-kobj_type-damon-v1-1-9d4fea6a465b@weissschuh.net Signed-off-by: Thomas Weißschuh Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.c | 2 +- mm/damon/sysfs-common.h | 4 ++-- mm/damon/sysfs-schemes.c | 18 +++++++++--------- mm/damon/sysfs.c | 22 +++++++++++----------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 52bebf242f74..70edf45c2174 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -99,7 +99,7 @@ static struct attribute *damon_sysfs_ul_range_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_ul_range); -struct kobj_type damon_sysfs_ul_range_ktype = { +const struct kobj_type damon_sysfs_ul_range_ktype = { .release = damon_sysfs_ul_range_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_ul_range_groups, diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 604a6cbc3ede..db677eba78fd 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -21,7 +21,7 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( unsigned long max); void damon_sysfs_ul_range_release(struct kobject *kobj); -extern struct kobj_type damon_sysfs_ul_range_ktype; +extern const struct kobj_type damon_sysfs_ul_range_ktype; /* * schemes directory @@ -36,7 +36,7 @@ struct damon_sysfs_schemes { struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); -extern struct kobj_type damon_sysfs_schemes_ktype; +extern const struct kobj_type damon_sysfs_schemes_ktype; int damon_sysfs_set_schemes(struct damon_ctx *ctx, struct damon_sysfs_schemes *sysfs_schemes); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 86edca66aab1..3cdad5a7f936 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -103,7 +103,7 @@ static struct attribute *damon_sysfs_scheme_region_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); -static struct kobj_type damon_sysfs_scheme_region_ktype = { +static const struct kobj_type damon_sysfs_scheme_region_ktype = { .release = damon_sysfs_scheme_region_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_region_groups, @@ -153,7 +153,7 @@ static struct attribute *damon_sysfs_scheme_regions_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); -static struct kobj_type damon_sysfs_scheme_regions_ktype = { +static const struct kobj_type damon_sysfs_scheme_regions_ktype = { .release = damon_sysfs_scheme_regions_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_regions_groups, @@ -252,7 +252,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_stats); -static struct kobj_type damon_sysfs_stats_ktype = { +static const struct kobj_type damon_sysfs_stats_ktype = { .release = damon_sysfs_stats_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_stats_groups, @@ -678,7 +678,7 @@ static struct attribute *damon_sysfs_watermarks_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_watermarks); -static struct kobj_type damon_sysfs_watermarks_ktype = { +static const struct kobj_type damon_sysfs_watermarks_ktype = { .release = damon_sysfs_watermarks_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_watermarks_groups, @@ -789,7 +789,7 @@ static struct attribute *damon_sysfs_weights_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_weights); -static struct kobj_type damon_sysfs_weights_ktype = { +static const struct kobj_type damon_sysfs_weights_ktype = { .release = damon_sysfs_weights_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_weights_groups, @@ -920,7 +920,7 @@ static struct attribute *damon_sysfs_quotas_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); -static struct kobj_type damon_sysfs_quotas_ktype = { +static const struct kobj_type damon_sysfs_quotas_ktype = { .release = damon_sysfs_quotas_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_quotas_groups, @@ -1019,7 +1019,7 @@ static struct attribute *damon_sysfs_access_pattern_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); -static struct kobj_type damon_sysfs_access_pattern_ktype = { +static const struct kobj_type damon_sysfs_access_pattern_ktype = { .release = damon_sysfs_access_pattern_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_access_pattern_groups, @@ -1279,7 +1279,7 @@ static struct attribute *damon_sysfs_scheme_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); -static struct kobj_type damon_sysfs_scheme_ktype = { +static const struct kobj_type damon_sysfs_scheme_ktype = { .release = damon_sysfs_scheme_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_groups, @@ -1396,7 +1396,7 @@ static struct attribute *damon_sysfs_schemes_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_schemes); -struct kobj_type damon_sysfs_schemes_ktype = { +const struct kobj_type damon_sysfs_schemes_ktype = { .release = damon_sysfs_schemes_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_schemes_groups, diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index aeb0beb1da91..33e1d5c9cb54 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -81,7 +81,7 @@ static struct attribute *damon_sysfs_region_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_region); -static struct kobj_type damon_sysfs_region_ktype = { +static const struct kobj_type damon_sysfs_region_ktype = { .release = damon_sysfs_region_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_region_groups, @@ -198,7 +198,7 @@ static struct attribute *damon_sysfs_regions_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_regions); -static struct kobj_type damon_sysfs_regions_ktype = { +static const struct kobj_type damon_sysfs_regions_ktype = { .release = damon_sysfs_regions_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_regions_groups, @@ -277,7 +277,7 @@ static struct attribute *damon_sysfs_target_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_target); -static struct kobj_type damon_sysfs_target_ktype = { +static const struct kobj_type damon_sysfs_target_ktype = { .release = damon_sysfs_target_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_target_groups, @@ -402,7 +402,7 @@ static struct attribute *damon_sysfs_targets_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_targets); -static struct kobj_type damon_sysfs_targets_ktype = { +static const struct kobj_type damon_sysfs_targets_ktype = { .release = damon_sysfs_targets_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_targets_groups, @@ -530,7 +530,7 @@ static struct attribute *damon_sysfs_intervals_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_intervals); -static struct kobj_type damon_sysfs_intervals_ktype = { +static const struct kobj_type damon_sysfs_intervals_ktype = { .release = damon_sysfs_intervals_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_intervals_groups, @@ -612,7 +612,7 @@ static struct attribute *damon_sysfs_attrs_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_attrs); -static struct kobj_type damon_sysfs_attrs_ktype = { +static const struct kobj_type damon_sysfs_attrs_ktype = { .release = damon_sysfs_attrs_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_attrs_groups, @@ -800,7 +800,7 @@ static struct attribute *damon_sysfs_context_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_context); -static struct kobj_type damon_sysfs_context_ktype = { +static const struct kobj_type damon_sysfs_context_ktype = { .release = damon_sysfs_context_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_context_groups, @@ -926,7 +926,7 @@ static struct attribute *damon_sysfs_contexts_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_contexts); -static struct kobj_type damon_sysfs_contexts_ktype = { +static const struct kobj_type damon_sysfs_contexts_ktype = { .release = damon_sysfs_contexts_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_contexts_groups, @@ -1564,7 +1564,7 @@ static struct attribute *damon_sysfs_kdamond_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_kdamond); -static struct kobj_type damon_sysfs_kdamond_ktype = { +static const struct kobj_type damon_sysfs_kdamond_ktype = { .release = damon_sysfs_kdamond_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_kdamond_groups, @@ -1707,7 +1707,7 @@ static struct attribute *damon_sysfs_kdamonds_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); -static struct kobj_type damon_sysfs_kdamonds_ktype = { +static const struct kobj_type damon_sysfs_kdamonds_ktype = { .release = damon_sysfs_kdamonds_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_kdamonds_groups, @@ -1757,7 +1757,7 @@ static struct attribute *damon_sysfs_ui_dir_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); -static struct kobj_type damon_sysfs_ui_dir_ktype = { +static const struct kobj_type damon_sysfs_ui_dir_ktype = { .release = damon_sysfs_ui_dir_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_ui_dir_groups, -- cgit v1.2.3 From 223ec6ab265ead0b319bc2f15d0d1be05078a74b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 7 Feb 2023 06:27:00 +0000 Subject: mm/memremap.c: fix outdated comment in devm_memremap_pages commit a4574f63edc6 ("mm/memremap_pages: convert to 'struct range'") converted res to range, update the comment correspondingly. Link: https://lkml.kernel.org/r/1675751220-2-1-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Cc: Dan Williams Signed-off-by: Andrew Morton --- mm/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index 2f88f43d4a01..bee85560a243 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -385,7 +385,7 @@ EXPORT_SYMBOL_GPL(memremap_pages); * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res and type members of @pgmap must be initialized + * 1/ At a minimum the range and type members of @pgmap must be initialized * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case -- cgit v1.2.3 From f528260b1a7d52140dfeb58857e13fc98ac193ef Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 13 Feb 2023 13:43:24 -0800 Subject: mm/khugepaged: fix invalid page access in release_pte_pages() release_pte_pages() converts from a pfn to a folio by using pfn_folio(). If the pte is not mapped, pfn_folio() will result in undefined behavior which ends up causing a kernel panic[1]. Only call pfn_folio() once we have validated that the pte is both valid and mapped to fix the issue. [1] https://lore.kernel.org/linux-mm/ff300770-afe9-908d-23ed-d23e0796e899@samsung.com/ Link: https://lkml.kernel.org/r/20230213214324.34215-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Fixes: 9bdfeea46f49 ("mm/khugepaged: convert release_pte_pages() to use folios") Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Debugged-by: Alexandre Ghiti Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b39ab219d5b7..bd54b957f69a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,11 +511,17 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, while (--_pte >= pte) { pte_t pteval = *_pte; + unsigned long pfn; - folio = pfn_folio(pte_pfn(pteval)); - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !folio_test_large(folio)) - release_pte_folio(folio); + if (pte_none(pteval)) + continue; + pfn = pte_pfn(pteval); + if (is_zero_pfn(pfn)) + continue; + folio = pfn_folio(pfn); + if (folio_test_large(folio)) + continue; + release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { -- cgit v1.2.3 From 6aa3a920125e9f58891e2b5dc2efd4d0c1ff05a6 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:50 -0600 Subject: mm/hugetlb: convert isolate_hugetlb to folios Patch series "continue hugetlb folio conversion", v3. This series continues the conversion of core hugetlb functions to use folios. This series converts many helper funtions in the hugetlb fault path. This is in preparation for another series to convert the hugetlb fault code paths to operate on folios. This patch (of 8): Convert isolate_hugetlb() to take in a folio and convert its callers to pass a folio. Use page_folio() to convert the callers to use a folio is safe as isolate_hugetlb() operates on a head page. Link: https://lkml.kernel.org/r/20230113223057.173292-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230113223057.173292-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 16 ++++++++-------- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a51e6daacac6..6e38a019f654 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); +int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,7 +413,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } diff --git a/mm/gup.c b/mm/gup.c index 25e4a3d923d6..b0885f70579c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab35b1cc9927..0c1e1ce113c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,7 +2925,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3000,7 +3000,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7250,19 +7250,19 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b4b30d9b0782..db85c2d37f70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f176..a1e8c3e9ab08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dd5ca942256f..fc034b070645 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(page_folio(page), qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 206fcdbe67f3..f6464bce7678 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } -- cgit v1.2.3 From 6f6956cf7e6a3034f61780446547e849aa4e216d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:51 -0600 Subject: mm/hugetlb: convert __update_and_free_page() to folios Change __update_and_free_page() to __update_and_free_hugetlb_folio() by changing its callers to pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c1e1ce113c8..d27fcf768548 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1698,10 +1698,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, enqueue_hugetlb_folio(h, folio); } -static void __update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_hugetlb_folio(struct hstate *h, + struct folio *folio) { int i; - struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1714,7 +1714,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, page)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1750,7 +1750,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(page, huge_page_order(h)); + __free_pages(&folio->page, huge_page_order(h)); } } @@ -1790,7 +1790,7 @@ static void free_hpage_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - __update_and_free_page(h, page); + __update_and_free_hugetlb_folio(h, page_folio(page)); cond_resched(); } @@ -1807,7 +1807,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { - __update_and_free_page(h, &folio->page); + __update_and_free_hugetlb_folio(h, folio); return; } -- cgit v1.2.3 From a36f1e9024740c3820427afca4cd375e32a1bb15 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:52 -0600 Subject: mm/hugetlb: convert dequeue_hugetlb_page functions to folios dequeue_huge_page_node_exact() is changed to dequeue_hugetlb_folio_node_ exact() and dequeue_huge_page_nodemask() is changed to dequeue_hugetlb_ folio_nodemask(). Update their callers to pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d27fcf768548..3e648fccf33e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1282,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, + int nid) { - struct page *page; + struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_longterm_pinnable_page(page)) + list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { + if (pin && !folio_is_longterm_pinnable(folio)) continue; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) continue; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - ClearHPageFreed(page); + list_move(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - return page; + return folio; } return NULL; } -static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) +static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; @@ -1320,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { - struct page *page; + struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; @@ -1332,9 +1333,9 @@ retry_cpuset: continue; node = zone_to_nid(zone); - page = dequeue_huge_page_node_exact(h, node); - if (page) - return page; + folio = dequeue_hugetlb_folio_node_exact(h, node); + if (folio) + return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; @@ -1352,7 +1353,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1374,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (!folio) + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); - if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetHPageRestoreReserve(page); + if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); - return page; + return &folio->page; err: return NULL; @@ -2475,12 +2478,13 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { - struct page *page; + struct folio *folio; - page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); - if (page) { + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + preferred_nid, nmask); + if (folio) { spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } } spin_unlock_irq(&hugetlb_lock); -- cgit v1.2.3 From 3a740e8bb56ef7ee6b9098b694caabab843be067 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:53 -0600 Subject: mm/hugetlb: convert alloc_surplus_huge_page() to folios Change alloc_surplus_huge_page() to alloc_surplus_hugetlb_folio() and update its callers. Link: https://lkml.kernel.org/r/20230113223057.173292-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e648fccf33e..fa61b4aa68ca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2378,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; @@ -2416,7 +2416,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, out_unlock: spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, @@ -2449,7 +2449,7 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; @@ -2460,16 +2460,16 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + if (!folio) + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* page migration callback function */ @@ -2518,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); + struct folio *folio; struct page *page, *tmp; int ret; long i; @@ -2537,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); - if (!page) { + if (!folio) { alloc_ok = false; break; } - list_add(&page->lru, &surplus_list); + list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; @@ -3496,7 +3497,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -3539,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. -- cgit v1.2.3 From ff7d853b031302376a0d3640fa1c463d94079637 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:54 -0600 Subject: mm/hugetlb: increase use of folios in alloc_huge_page() Change hugetlb_cgroup_commit_charge{,_rsvd}(), dequeue_huge_page_vma() and alloc_buddy_huge_page_with_mpol() to use folios so alloc_huge_page() is cleaned by operating on folios until its return. Link: https://lkml.kernel.org/r/20230113223057.173292-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 33 ++++++++++++++++----------------- mm/hugetlb_cgroup.c | 8 ++------ 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a8063..3d82d91f49ac 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fa61b4aa68ca..5d0d1efbe590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1348,7 +1348,7 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -static struct page *dequeue_huge_page_vma(struct hstate *h, +static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) @@ -1392,7 +1392,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } mpol_cond_put(mpol); - return &folio->page; + return folio; err: return NULL; @@ -2446,7 +2446,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; @@ -2469,7 +2469,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* page migration callback function */ @@ -3018,7 +3018,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); - struct page *page; struct folio *folio; long map_chg, map_commit; long gbl_chg; @@ -3082,34 +3081,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { + folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!folio) { spin_unlock_irq(&hugetlb_lock); - page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } - list_add(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); /* Fall through */ } - folio = page_folio(page); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, page); + h_cg, folio); } spin_unlock_irq(&hugetlb_lock); - hugetlb_set_page_subpool(page, spool); + hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3130,7 +3129,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return page; + return &folio->page; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d9e4425d81ac..dedd2edb076e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } -- cgit v1.2.3 From e37d3e838d9078538f920957d1e89682b6764977 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:55 -0600 Subject: mm/hugetlb: convert alloc_migrate_huge_page to folios Change alloc_huge_page_nodemask() to alloc_hugetlb_folio_nodemask() and alloc_migrate_huge_page() to alloc_migrate_hugetlb_folio(). Both functions now return a folio rather than a page. Link: https://lkml.kernel.org/r/20230113223057.173292-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 18 +++++++++--------- mm/migrate.c | 5 ++++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e38a019f654..2375c62c61a4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -719,7 +719,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); @@ -1040,8 +1040,8 @@ static inline struct page *alloc_huge_page(struct vm_area_struct *vma, return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d0d1efbe590..57894beb3382 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2419,7 +2419,7 @@ out_unlock: return folio; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; @@ -2439,7 +2439,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, */ folio_set_hugetlb_temporary(folio); - return &folio->page; + return folio; } /* @@ -2472,8 +2472,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, return folio; } -/* page migration callback function */ -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +/* folio migration callback function */ +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); @@ -2484,12 +2484,12 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } } spin_unlock_irq(&hugetlb_lock); - return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* mempolicy aware migration callback */ @@ -2498,16 +2498,16 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, { struct mempolicy *mpol; nodemask_t *nodemask; - struct page *page; + struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* diff --git a/mm/migrate.c b/mm/migrate.c index f6464bce7678..811e76c6fac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1663,6 +1663,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; + struct folio *hugetlb_folio = NULL; struct folio *new_folio = NULL; int nid; int zidx; @@ -1677,7 +1678,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + mtc->nmask, gfp_mask); + return &hugetlb_folio->page; } if (folio_test_large(folio)) { -- cgit v1.2.3 From 0ffdc38eb564c1c71a58bbaf874945ba54293ff9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:56 -0600 Subject: mm/hugetlb: convert restore_reserve_on_error() to folios Use the hugetlb folio flag macros inside restore_reserve_on_error() and update the comments to reflect the use of folios. Link: https://lkml.kernel.org/r/20230113223057.173292-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57894beb3382..3120c3db60c4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2819,22 +2819,23 @@ static long vma_del_reservation(struct hstate *h, void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { + struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); - if (HPageRestoreReserve(page)) { + if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map - * manipulation. Clear HPageRestoreReserve so that - * global reserve count will not be incremented + * manipulation. Clear hugetlb_restore_reserve so + * that global reserve count will not be incremented * by free_huge_page. This will make it appear - * as though the reservation for this page was + * as though the reservation for this folio was * consumed. This may prevent the task from - * faulting in the page at a later time. This + * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else @@ -2845,7 +2846,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * This indicates there is an entry in the reserve map * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise - * HPageRestoreReserve would be set on the page. + * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ @@ -2854,12 +2855,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * VERY rare out of memory condition. Since * we can not delete the entry, set - * HPageRestoreReserve so that the reserve - * count will be incremented when the page + * hugetlb_restore_reserve so that the reserve + * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from @@ -2875,12 +2876,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * For private mappings, no entry indicates * a reservation is present. Since we can - * not add an entry, set SetHPageRestoreReserve - * on the page so reserve count will be + * not add an entry, set hugetlb_restore_reserve + * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing -- cgit v1.2.3 From bdd7be075acb650cc57d8ee752b5375b966ad07e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:57 -0600 Subject: mm/hugetlb: convert demote_free_huge_page to folios Change demote_free_huge_page to demote_free_hugetlb_folio() and change demote_pool_huge_page() pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3120c3db60c4..4ecdbad9a451 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3579,12 +3579,12 @@ out: return 0; } -static int demote_free_huge_page(struct hstate *h, struct page *page) +static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i, nid = page_to_nid(page); + int i, nid = folio_nid(folio); struct hstate *target_hstate; - struct folio *folio = page_folio(page); struct page *subpage; + struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3592,18 +3592,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, page); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (rc) { - /* Allocation of vmemmmap failed, we can not demote page */ + /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - set_page_refcounted(page); - add_hugetlb_folio(h, page_folio(page), false); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); return rc; } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count pages. + * sizes as it will not ref count folios. */ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); @@ -3618,15 +3618,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { - subpage = nth_page(page, i); - folio = page_folio(subpage); + subpage = folio_page(folio, i); + inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(folio, + prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); - set_page_private(subpage, 0); - prep_new_hugetlb_folio(target_hstate, folio, nid); + folio_change_private(inner_folio, NULL); + prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3648,7 +3648,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct page *page; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); @@ -3659,11 +3659,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(page, &h->hugepage_freelists[node], lru) { - if (PageHWPoison(page)) + list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) continue; - - return demote_free_huge_page(h, page); + return demote_free_hugetlb_folio(h, folio); } } -- cgit v1.2.3 From ea4c353df37750d170dc0dcbfa8c47c984779733 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:30 -0800 Subject: mm/hugetlb: convert hugetlb_install_page to folios Patch series "convert hugetlb fault functions to folios", v2. This series converts the hugetlb page faulting functions to operate on folios. These include hugetlb_no_page(), hugetlb_wp(), copy_hugetlb_page_range(), and hugetlb_mcopy_atomic_pte(). This patch (of 8): Change hugetlb_install_page() to hugetlb_install_folio(). This reduces one user of the Huge Page flag macros which take in a page. Link: https://lkml.kernel.org/r/20230125170537.96973-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230125170537.96973-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4ecdbad9a451..b246f2b4d0bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4946,14 +4946,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) } static void -hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct page *new_page) +hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct folio *new_folio) { - __SetPageUptodate(new_page); - hugepage_add_new_anon_rmap(new_page, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + __folio_mark_uptodate(new_folio); + hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - SetHPageMigratable(new_page); + folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5107,7 +5107,7 @@ again: /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_page(dst_vma, dst_pte, addr, new); + hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; -- cgit v1.2.3 From 91a2fb956ad993f3cbcfc632611e17e3699fb652 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:31 -0800 Subject: mm/hugetlb: convert hugetlbfs_pagecache_present() to folios Refactor hugetlbfs_pagecache_present() to avoid getting and dropping a refcount on a page. Use RCU and page_cache_next_miss() instead. Link: https://lkml.kernel.org/r/20230125170537.96973-3-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b246f2b4d0bd..a0d486ed5411 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5651,17 +5651,15 @@ out_release_old: static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { - struct address_space *mapping; - pgoff_t idx; - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, vma, address); + bool present; - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + rcu_read_lock(); + present = page_cache_next_miss(mapping, idx, 1) != idx; + rcu_read_unlock(); - page = find_get_page(mapping, idx); - if (page) - put_page(page); - return page != NULL; + return present; } int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, -- cgit v1.2.3 From ea8e72f4116a995c2aba3fb738ac372c4115375a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:32 -0800 Subject: mm/hugetlb: convert putback_active_hugepage to take in a folio Convert putback_active_hugepage() to folio_putback_active_hugetlb(), this removes one user of the Huge Page macros which take in a page. The callers in migrate.c are also cleaned up by being able to directly use the src and dst folio variables. Link: https://lkml.kernel.org/r/20230125170537.96973-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 8 ++++---- mm/migrate.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2375c62c61a4..067906c5778e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -429,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d486ed5411..fd1ce61b8f3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7300,13 +7300,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void putback_active_hugepage(struct page *page) +void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); - SetHPageMigratable(page); - list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + folio_set_hugetlb_migratable(folio); + list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - put_page(page); + folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) diff --git a/mm/migrate.c b/mm/migrate.c index 811e76c6fac1..c09872cf41b7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -151,7 +151,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { if (unlikely(PageHuge(page))) { - putback_active_hugepage(page); + folio_putback_active_hugetlb(page_folio(page)); continue; } list_del(&page->lru); @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1383,7 +1383,7 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1395,7 +1395,7 @@ out: if (put_new_page) put_new_page(new_hpage, private); else - putback_active_hugepage(new_hpage); + folio_putback_active_hugetlb(dst); return rc; } -- cgit v1.2.3 From d0ce0e47b323a8d7fb5dc3314ce56afa650ade2d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:33 -0800 Subject: mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio() Change alloc_huge_page() to alloc_hugetlb_folio() by changing all callers to handle the now folio return type of the function. In this conversion, alloc_huge_page_vma() is also changed to alloc_hugetlb_folio_vma() and hugepage_add_new_anon_rmap() is changed to take in a folio directly. Many additions of '&folio->page' are cleaned up in subsequent patches. hugetlbfs_fallocate() is also refactored to use the RCU + page_cache_next_miss() API. Link: https://lkml.kernel.org/r/20230125170537.96973-5-sidhartha.kumar@oracle.com Suggested-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 40 +++++----- include/linux/hugetlb.h | 8 +- include/linux/rmap.h | 2 +- mm/hugetlb.c | 201 ++++++++++++++++++++++++------------------------ mm/mempolicy.c | 6 +- mm/rmap.c | 6 +- 6 files changed, 133 insertions(+), 130 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 44ecdcb796cc..f89e12106e8a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 067906c5778e..6408f85e5754 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -717,11 +717,11 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -1033,7 +1033,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -1047,7 +1047,7 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a6bd1f0a183d..a4570da03e58 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd1ce61b8f3f..ea8d4611779b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2493,7 +2493,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; @@ -2507,7 +2507,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* @@ -2798,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the page consumed the reservation. * HPageRestoreReserve is set in the page. * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2814,7 +2814,7 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) @@ -2844,8 +2844,8 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. @@ -3014,7 +3014,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); @@ -3023,7 +3023,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3130,7 +3130,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return &folio->page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add struct folio *new_folio) { __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + hugepage_add_new_anon_rmap(new_folio, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); @@ -5080,34 +5080,34 @@ again: } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + &new_folio->page); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5478,7 +5478,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5539,9 +5540,9 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5586,7 +5587,7 @@ retry_avoidcopy: return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5599,9 +5600,9 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); @@ -5618,12 +5619,12 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5632,9 +5633,9 @@ out_release_all: * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, &new_folio->page); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5753,11 +5754,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5776,9 +5777,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5811,8 +5812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5826,17 +5827,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5845,13 +5846,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, &folio->page); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5864,7 +5865,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5872,8 +5873,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5907,10 +5908,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5923,20 +5924,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5945,11 +5946,11 @@ out: backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, &folio->page); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -6173,16 +6174,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6193,34 +6194,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6236,25 +6237,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6269,16 +6270,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6290,10 +6291,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6304,7 +6305,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6326,20 +6327,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6871,7 +6872,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc034b070645..7686f40c9750 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1210,9 +1210,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; diff --git a/mm/rmap.c b/mm/rmap.c index 86fccc2b9fc9..8287f2cc327d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2534,15 +2534,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From d2d7bb44bfbd29200426ba17741550d36e081f91 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:34 -0800 Subject: mm/hugetlb: convert restore_reserve_on_error to take in a folio Every caller of restore_reserve_on_error() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f89e12106e8a..c736947e73da 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -873,7 +873,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6408f85e5754..20ceaaea1697 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -726,7 +726,7 @@ struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *v int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8d4611779b..1f6270c586c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2802,9 +2802,9 @@ static long vma_del_reservation(struct hstate *h, * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: - * 1) A reservation was in place and the page consumed the reservation. - * HPageRestoreReserve is set in the page. - * 2) No reservation was in place for the page, so HPageRestoreReserve is + * 1) A reservation was in place and the folio consumed the reservation. + * hugetlb_restore_reserve is set in the folio. + * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the @@ -2817,9 +2817,8 @@ static long vma_del_reservation(struct hstate *h, * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page) + unsigned long address, struct folio *folio) { - struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { @@ -5102,7 +5101,7 @@ again: entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - &new_folio->page); + new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; @@ -5634,7 +5633,7 @@ out_release_all: * unshare) */ if (new_folio != page_folio(old_page)) - restore_reserve_on_error(h, vma, haddr, &new_folio->page); + restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: put_page(old_page); @@ -5846,7 +5845,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); goto out; } @@ -5947,7 +5946,7 @@ backout: spin_unlock(ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_unlock(folio); folio_put(folio); @@ -6210,7 +6209,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied @@ -6339,7 +6338,7 @@ out_release_unlock: folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } -- cgit v1.2.3 From 9b91c0e277a3dbb165c2e4301be7a231dc2f76f7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:35 -0800 Subject: mm/hugetlb: convert hugetlb_add_to_page_cache to take in a folio Every caller of hugetlb_add_to_page_cache() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c736947e73da..cfd09f95551b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -871,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); - error = hugetlb_add_to_page_cache(&folio->page, mapping, index); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20ceaaea1697..df6dd624ccfe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -723,7 +723,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1f6270c586c0..de1f73e5e200 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5662,10 +5662,9 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return present; } -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; @@ -5677,7 +5676,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, __folio_clear_locked(folio); return err; } - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file @@ -5836,7 +5835,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -6269,7 +6268,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; -- cgit v1.2.3 From 371607a3c793d7183b0faecc1fb4aa88fadcf202 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:36 -0800 Subject: mm/hugetlb: convert hugetlb_wp() to take in a folio Change the pagecache_page argument of hugetlb_wp to pagecache_folio. Replaces a call to find_lock_page() with filemap_lock_folio(). Link: https://lkml.kernel.org/r/20230125170537.96973-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: gerald.schaefer@linux.ibm.com Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de1f73e5e200..3a01a9dbf445 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5472,7 +5472,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct page *pagecache_page, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; @@ -5529,7 +5529,7 @@ retry_avoidcopy: * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_page != pagecache_page) + page_folio(old_page) != pagecache_folio) outside_reserve = 1; get_page(old_page); @@ -5922,7 +5922,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); @@ -5985,7 +5985,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; - struct page *pagecache_page = NULL; + struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; @@ -6067,7 +6067,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = find_lock_page(mapping, idx); + pagecache_folio = filemap_lock_folio(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6087,9 +6087,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, }; spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -6098,11 +6098,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and - * pagecache_page, so here we need take the former one - * when page != pagecache_page or !pagecache_page. + * pagecache_folio, so here we need take the former one + * when page != pagecache_folio or !pagecache_folio. */ page = pte_page(entry); - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) if (!trylock_page(page)) { need_wait_lock = 1; goto out_ptl; @@ -6113,7 +6113,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_page, ptl); + pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); @@ -6124,15 +6124,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) unlock_page(page); put_page(page); out_ptl: spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); -- cgit v1.2.3 From 192a50220342f82826812d0032a82fe441e924e2 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:37 -0800 Subject: Documentation/mm: update hugetlbfs documentation to mention alloc_hugetlb_folio Link: https://lkml.kernel.org/r/20230125170537.96973-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 21 +++++++++++---------- .../translations/zh_CN/mm/hugetlbfs_reserv.rst | 14 +++++++------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index f143954e0d05..611728c49bff 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -181,14 +181,14 @@ Consuming Reservations/Allocating a Huge Page Reservations are consumed when huge pages associated with the reservations are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: +is performed within the routine alloc_hugetlb_folio():: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page is passed a VMA pointer and a virtual address, so it can +alloc_hugetlb_folio is passed a VMA pointer and a virtual address, so it can consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves +alloc_hugetlb_folio takes the argument avoid_reserve which indicates reserves should not be used even if it appears they have been set aside for the specified address. The avoid_reserve argument is most often used in the case of Copy on Write and Page Migration where additional copies of an existing @@ -208,7 +208,8 @@ a reservation for the allocation. After determining whether a reservation exists and can be used for the allocation, the routine dequeue_huge_page_vma() is called. This routine takes two arguments related to reservations: -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- avoid_reserve, this is the same value/argument passed to + alloc_hugetlb_folio(). - chg, even though this argument is of type long only the values 0 or 1 are passed to dequeue_huge_page_vma. If the value is 0, it indicates a reservation exists (see the section "Memory Policy and Reservations" for @@ -233,9 +234,9 @@ the scope reservations. Even if a surplus page is allocated, the same reservation based adjustments as above will be made: SetPagePrivate(page) and resv_huge_pages--. -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. +After obtaining a new hugetlb folio, (folio)->_hugetlb_subpool is set to the +value of the subpool associated with the page if it exists. This will be used +for subpool accounting when the folio is freed. The routine vma_commit_reservation() is then called to adjust the reserve map based on the consumption of the reservation. In general, this involves @@ -246,8 +247,8 @@ was no reservation in a shared mapping or this was a private mapping a new entry must be created. It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would +to vma_needs_reservation() at the beginning of alloc_hugetlb_folio() and the +call to vma_commit_reservation() after the folio was allocated. This would be possible if hugetlb_reserve_pages was called for the same page in a shared mapping. In such cases, the reservation count and subpool free page count will be off by one. This rare condition can be identified by comparing the diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index 752e5696cd47..826a50c47389 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -142,14 +142,14 @@ HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 消耗预留/分配一个巨页 =========================== -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_hugetlb_folio() 中进行的:: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +alloc_hugetlb_folio被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_hugetlb_folio需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 外拷贝被分配。 @@ -162,7 +162,7 @@ vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留 确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 的参数: -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- avoid_reserve,这是传递给alloc_hugetlb_folio()的同一个值/参数。 - chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 @@ -179,7 +179,7 @@ free_huge_pages的值被递减。如果有一个与该页相关的预留,将 的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: SetPagePrivate(page) 和 resv_huge_pages--. -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +在获得一个新的巨页后,(folio)->_hugetlb_subpool被设置为与该页面相关的子池的值,如果它存在的话。当页 面被释放时,这将被用于子池的计数。 然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 @@ -199,7 +199,7 @@ SetPagePrivate(page)和resv_huge_pages-。 已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 一个新的条目。 -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +在alloc_hugetlb_folio()开始调用vma_needs_reservation()和页面分配后调用 vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 -- cgit v1.2.3 From fa4e3f5ffa5e6e22f751d289c9afa502dda30b8d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:28 -0800 Subject: mm: add folio_estimated_sharers() Patch series "Convert various mempolicy.c functions to use folios", v4. This patch series converts migrate_page_add() and queue_pages_required() to migrate_folio_add() and queue_page_required(). It also converts the callers of the functions to use folios as well, and introduces a helper function to estimate the number of sharers of a folio. This patch (of 6): folio_estimated_sharers() takes in a folio and returns the precise number of times the first subpage of the folio is mapped. This function aims to provide an estimate for the number of sharers of a folio. This is necessary for folio conversions where we care about the number of processes that share a folio, but don't necessarily want to check every single page within that folio. This is in contrast to folio_mapcount() which calculates the total number of the times a folio and all its subpages are mapped. Link: https://lkml.kernel.org/r/20230130201833.27042-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130201833.27042-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Acked-by: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9454b7eb055b..89c118ad4a44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1916,6 +1916,24 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ + return page_mapcount(folio_page(folio, 0)); +} + #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { -- cgit v1.2.3 From de1f5055523e9a035b38533f25a56df03d45034a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:29 -0800 Subject: mm/mempolicy: convert queue_pages_pmd() to queue_folios_pmd() The function now operates on a folio instead of the page associated with a pmd. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7686f40c9750..fc754dbcbbcd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -442,21 +442,21 @@ static inline bool queue_pages_required(struct page *page, } /* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pmd() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the + * existing folio was already on a node that does not follow the * policy. */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, +static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags; @@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; goto unlock; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pfn_folio(pmd_pfn(*pmd)); + if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) goto unlock; flags = qp->flags; - /* go to thp migration */ + /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { + migrate_page_add(&folio->page, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + return queue_folios_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; -- cgit v1.2.3 From 3dae02bbd07f40e37bbfec2d77119628db461eaa Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:30 -0800 Subject: mm/mempolicy: convert queue_pages_pte_range() to queue_folios_pte_range() This function now operates on folios associated with ptes instead of pages. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc754dbcbbcd..b0805bb87655 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -491,19 +491,19 @@ unlock: * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pte_range() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already + * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; bool has_unmovable = false; @@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(page, qp->pagelist, flags)) + if (migrate_page_add(&folio->page, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -704,7 +704,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; -- cgit v1.2.3 From 0a2c1e8183163a31fe8c9838f3108aacf9c05c4a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:31 -0800 Subject: mm/mempolicy: convert queue_pages_hugetlb() to queue_folios_hugetlb() This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b0805bb87655..668392493500 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -558,7 +558,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return addr != end ? -EIO : 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) + folio = pfn_folio(pte_pfn(entry)); + if (!queue_pages_required(&folio->page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* - * STRICT alone means only detecting misplaced page and no + * STRICT alone means only detecting misplaced folio and no * need to further check other vma. */ ret = -EIO; @@ -591,21 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which + * Detecting misplaced folio but allow migrating folios which * have been queued. */ ret = 1; goto unlock; } - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + /* + * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it + * is shared it is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. + */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page_folio(page), qp->pagelist) && + if (isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* - * Failed to isolate page but allow migrating pages + * Failed to isolate folio but allow migrating pages * which have been queued. */ ret = 1; @@ -703,7 +710,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, + .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; -- cgit v1.2.3 From d451b89dcd183da725eda84dfb8a46c0b32a4234 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:32 -0800 Subject: mm/mempolicy: convert queue_pages_required() to queue_folio_required() Replace queue_pages_required() with queue_folio_required(). queue_folio_required() does the same as queue_pages_required(), except takes in a folio instead of a page. Link: https://lkml.kernel.org/r/20230130201833.27042-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 668392493500..6a68dbce3b70 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -427,15 +427,15 @@ struct queue_pages { }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); @@ -469,7 +469,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; flags = qp->flags; @@ -530,7 +530,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, */ if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -575,7 +575,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; folio = pfn_folio(pte_pfn(entry)); - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { -- cgit v1.2.3 From 4a64981dfee9119aa2c1f243b48f34cbbd67779c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:33 -0800 Subject: mm/mempolicy: convert migrate_page_add() to migrate_folio_add() Replace migrate_page_add() with migrate_folio_add(). migrate_folio_add() does the same a migrate_page_add() but takes in a folio instead of a page. This removes a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20230130201833.27042-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Cc: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- mm/mempolicy.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6a68dbce3b70..0919c7a719d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); struct queue_pages { @@ -476,7 +476,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(&folio->page, qp->pagelist, flags)) { + migrate_folio_add(folio, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -544,7 +544,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(&folio->page, qp->pagelist, flags)) + if (migrate_folio_add(folio, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -1021,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } else if (flags & MPOL_MF_STRICT) { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ @@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start) } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return -EIO; -- cgit v1.2.3 From 3c1ea2c729ef8ef07bcb80d01ab2ead45b3406dd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:49 -0800 Subject: mm: add folio_get_nontail_page() Patch series "Convert a couple migrate functions to use folios", v2. This patchset introduces folio_movable_ops() and converts 3 functions in mm/migrate.c to use folios. It also introduces folio_get_nontail_page() for folio conversions which may want to distinguish between head and tail pages. This patch (of 4): folio_get_nontail_page() returns the folio associated with a head page. This is necessary for folio conversions where the behavior of that function differs between head pages and tail pages. Link: https://lkml.kernel.org/r/20230130214352.40538-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130214352.40538-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 89c118ad4a44..2992a2d55aee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,6 +892,13 @@ static inline bool get_page_unless_zero(struct page *page) return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { -- cgit v1.2.3 From da707a6d184a8a6ef0b756c3ba49888fec223793 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:50 -0800 Subject: mm/migrate: add folio_movable_ops() folio_movable_ops() does the same as page_movable_ops() except uses folios instead of pages. This function will help make folio conversions in migrate.c more readable. Link: https://lkml.kernel.org/r/20230130214352.40538-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/migrate.h | 9 +++++++++ mm/migrate.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3ef77f52a4f0..bdff950a8bb4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -122,6 +122,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *folio_movable_ops(struct folio *folio) +{ + VM_BUG_ON(!__folio_test_movable(folio)); + + return (const struct movable_operations *) + ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE); +} + static inline const struct movable_operations *page_movable_ops(struct page *page) { diff --git a/mm/migrate.c b/mm/migrate.c index c09872cf41b7..d2b1167329b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -990,7 +990,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - mops = page_movable_ops(&src->page); + mops = folio_movable_ops(src); rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); -- cgit v1.2.3 From 19979497c02a365ed9d8276b5f4cc36557a13ced Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:51 -0800 Subject: mm/migrate: convert isolate_movable_page() to use folios Removes 6 calls to compound_head() and prepares the function to take in a folio instead of page argument. Link: https://lkml.kernel.org/r/20230130214352.40538-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d2b1167329b9..3cdb76e44ef5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -60,6 +60,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { + struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* @@ -71,11 +72,11 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ - if (unlikely(!get_page_unless_zero(page))) + if (!folio) goto out; - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ smp_rmb(); /* @@ -83,12 +84,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. */ - if (unlikely(!__PageMovable(page))) - goto out_putpage; + if (unlikely(!__folio_test_movable(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ smp_rmb(); - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -101,29 +102,29 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ - if (unlikely(!trylock_page(page))) - goto out_putpage; + if (unlikely(!folio_trylock(folio))) + goto out_putfolio; - if (!PageMovable(page) || PageIsolated(page)) + if (!folio_test_movable(folio) || folio_test_isolated(folio)) goto out_no_isolated; - mops = page_movable_ops(page); - VM_BUG_ON_PAGE(!mops, page); + mops = folio_movable_ops(folio); + VM_BUG_ON_FOLIO(!mops, folio); - if (!mops->isolate_page(page, mode)) + if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ - WARN_ON_ONCE(PageIsolated(page)); - SetPageIsolated(page); - unlock_page(page); + WARN_ON_ONCE(folio_test_isolated(folio)); + folio_set_isolated(folio); + folio_unlock(folio); return 0; out_no_isolated: - unlock_page(page); -out_putpage: - put_page(page); + folio_unlock(folio); +out_putfolio: + folio_put(folio); out: return -EBUSY; } -- cgit v1.2.3 From 280d724ac20f9cc463d4ab8e2269f598476b070f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:52 -0800 Subject: mm/migrate: convert putback_movable_pages() to use folios Removes 6 calls to compound_head(), and replaces putback_movable_page() with putback_movable_folio() as well. Link: https://lkml.kernel.org/r/20230130214352.40538-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 3cdb76e44ef5..5b40b9040ba6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -129,12 +129,12 @@ out: return -EBUSY; } -static void putback_movable_page(struct page *page) +static void putback_movable_folio(struct folio *folio) { - const struct movable_operations *mops = page_movable_ops(page); + const struct movable_operations *mops = folio_movable_ops(folio); - mops->putback_page(page); - ClearPageIsolated(page); + mops->putback_page(&folio->page); + folio_clear_isolated(folio); } /* @@ -147,33 +147,33 @@ static void putback_movable_page(struct page *page) */ void putback_movable_pages(struct list_head *l) { - struct page *page; - struct page *page2; + struct folio *folio; + struct folio *folio2; - list_for_each_entry_safe(page, page2, l, lru) { - if (unlikely(PageHuge(page))) { - folio_putback_active_hugetlb(page_folio(page)); + list_for_each_entry_safe(folio, folio2, l, lru) { + if (unlikely(folio_test_hugetlb(folio))) { + folio_putback_active_hugetlb(folio); continue; } - list_del(&page->lru); + list_del(&folio->lru); /* - * We isolated non-lru movable page so here we can use - * __PageMovable because LRU page's mapping cannot have + * We isolated non-lru movable folio so here we can use + * __PageMovable because LRU folio's mapping cannot have * PAGE_MAPPING_MOVABLE. */ - if (unlikely(__PageMovable(page))) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); + if (unlikely(__folio_test_movable(folio))) { + VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); + folio_lock(folio); + if (folio_test_movable(folio)) + putback_movable_folio(folio); else - ClearPageIsolated(page); - unlock_page(page); - put_page(page); + folio_clear_isolated(folio); + folio_unlock(folio); + folio_put(folio); } else { - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); - putback_lru_page(page); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -folio_nr_pages(folio)); + folio_putback_lru(folio); } } } -- cgit v1.2.3 From 5445fcbc4cda770cd07f49624704fabcc284d563 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:07 +0000 Subject: Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice Patch series "mm/damon: deprecate DAMON debugfs interface". DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And v6.1.y has been announced to be an LTS[1]. Though the announcement was there for a while, some people might not have noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, keep the code and documents with warning messages and contacts to ask helps for the deprecation. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c This patch (of 3): DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the document. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9237d6a25897..9b823fec974d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -25,10 +25,12 @@ DAMON provides below interfaces for different users. interface provides only simple :ref:`statistics ` for the monitoring results. For detailed monitoring results, DAMON provides a :ref:`tracepoint `. -- *debugfs interface.* +- *debugfs interface. (DEPRECATED!)* :ref:`This ` is almost identical to :ref:`sysfs interface - `. This will be removed after next LTS kernel is released, - so users should move to the :ref:`sysfs interface `. + `. This is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -487,13 +489,17 @@ the files as above. Above is only for an example. .. _debugfs_interface: -debugfs Interface -================= +debugfs Interface (DEPRECATED!) +=============================== .. note:: - DAMON debugfs interface will be removed after next LTS kernel is released, so - users should move to the :ref:`sysfs interface `. + THIS IS DEPRECATED! + + DAMON debugfs interface is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and -- cgit v1.2.3 From 61e88a2f66580d7488bbf7454423c81886d2e8cd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:08 +0000 Subject: mm/damon/Kconfig: add DAMON debugfs interface deprecation notice DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the Kconfig. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 7821fcb3f258..436c6b4cb5ec 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -60,7 +60,7 @@ config DAMON_SYSFS the interface for arbitrary data access monitoring. config DAMON_DBGFS - bool "DAMON debugfs interface" + bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins @@ -68,8 +68,9 @@ config DAMON_DBGFS If unsure, say N. - This will be removed after >5.15.y LTS kernel is released, so users - should move to the sysfs interface (DAMON_SYSFS). + This is deprecated, so users should move to the sysfs interface + (DAMON_SYSFS). If you depend on this and cannot move, please report + your usecase to damon@lists.linux.dev and linux-mm@kvack.org. config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS -- cgit v1.2.3 From 620932cd285208ef3009ac338b1eeed13ccd1753 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:09 +0000 Subject: mm/damon/dbgfs: print DAMON debugfs interface deprecation message DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, warn DAMON debugfs interface deprecation with contacts to ask helps when any DAMON debugfs interface file is opened. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c [sj@kernel.org: split DAMON debugfs file open warning message, per Randy] Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230210044838.63723-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index b3f454a5c682..124f0f8c97b7 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -20,6 +20,14 @@ static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; static DEFINE_MUTEX(damon_dbgfs_lock); +static void damon_dbgfs_warn_deprecation(void) +{ + pr_warn_once("DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"); +} + /* * Returns non-empty string on success, negative error code otherwise. */ @@ -711,6 +719,8 @@ out: static int damon_dbgfs_open(struct inode *inode, struct file *file) { + damon_dbgfs_warn_deprecation(); + file->private_data = inode->i_private; return nonseekable_open(inode, file); @@ -1039,15 +1049,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; } +static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) +{ + damon_dbgfs_warn_deprecation(); + return nonseekable_open(inode, file); +} + static const struct file_operations mk_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, }; static const struct file_operations rm_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_rm_context_write, }; static const struct file_operations monitor_on_fops = { + .open = damon_dbgfs_static_file_open, .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, }; -- cgit v1.2.3 From 6bdfc60cf0f9771d592a006dcd2cf6e40e1ccd79 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Fri, 10 Feb 2023 21:33:16 +0100 Subject: mm: fix typo in __vm_enough_memory warning Link: https://lkml.kernel.org/r/20230210203316.5613-1-jwilk@jwilk.net Signed-off-by: Jakub Wilk Acked-by: Mike Rapoport (IBM) Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index cec9327b27b4..b8ed9dbc7fd5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -967,7 +967,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: - pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n", + pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", __func__, current->pid, current->comm); vm_unacct_memory(pages); -- cgit v1.2.3 From 15ef6a982f40a2b53b057dad24f00c3fb43e7e70 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:49 +0100 Subject: lib/stackdepot: put functions in logical order Patch series "lib/stackdepot: fixes and clean-ups", v2. A set of fixes, comments, and clean-ups I came up with while reading the stack depot code. This patch (of 18): Put stack depot functions' declarations and definitions in a more logical order: 1. Functions that save stack traces into stack depot. 2. Functions that fetch and print stack traces. 3. stack_depot_get_extra_bits that operates on stack depot handles and does not interact with the stack depot storage. No functional changes. Link: https://lkml.kernel.org/r/cover.1676063693.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/daca1319b665d826b94c596b992a8d8117846147.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 15 ++- lib/stackdepot.c | 314 ++++++++++++++++++++++----------------------- 2 files changed, 165 insertions(+), 164 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 9ca7798d7a31..1296a6eeaec0 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,17 +14,13 @@ #include typedef u32 depot_stack_handle_t; + /* * Number of bits in the handle that stack depot doesn't use. Users may store * information in them. */ #define STACK_DEPOT_EXTRA_BITS 5 -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - unsigned int extra_bits, - gfp_t gfp_flags, bool can_alloc); - /* * Every user of stack depot has to call stack_depot_init() during its own init * when it's decided that it will be calling stack_depot_save() later. This is @@ -59,17 +55,22 @@ static inline void stack_depot_want_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + unsigned int extra_bits, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); +void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -void stack_depot_print(depot_stack_handle_t stack); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 79e894cf8406..4bfaf3bce619 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -79,84 +79,6 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) -{ - union handle_parts parts = { .handle = handle }; - - return parts.extra; -} -EXPORT_SYMBOL(stack_depot_get_extra_bits); - -static bool init_stack_slab(void **prealloc) -{ - if (!*prealloc) - return false; - /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). - */ - if (smp_load_acquire(&next_slab_inited)) - return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; - *prealloc = NULL; - } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; - *prealloc = NULL; - } - /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). - */ - smp_store_release(&next_slab_inited, 1); - } - return true; -} - -/* Allocation of a new stack in raw storage */ -static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) -{ - struct stack_record *stack; - size_t required_size = struct_size(stack, entries, size); - - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return NULL; - } - depot_index++; - depot_offset = 0; - /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). - */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); - } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) - return NULL; - - stack = stack_slabs[depot_index] + depot_offset; - - stack->hash = hash; - stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; - stack->handle.valid = 1; - stack->handle.extra = 0; - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; - - return stack; -} - /* one hash table bucket entry per 16kB of memory */ #define STACK_HASH_SCALE 14 /* limited between 4k and 1M buckets */ @@ -270,6 +192,76 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); +static bool init_stack_slab(void **prealloc) +{ + if (!*prealloc) + return false; + /* + * This smp_load_acquire() pairs with smp_store_release() to + * |next_slab_inited| below and in depot_alloc_stack(). + */ + if (smp_load_acquire(&next_slab_inited)) + return true; + if (stack_slabs[depot_index] == NULL) { + stack_slabs[depot_index] = *prealloc; + *prealloc = NULL; + } else { + /* If this is the last depot slab, do not touch the next one. */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { + stack_slabs[depot_index + 1] = *prealloc; + *prealloc = NULL; + } + /* + * This smp_store_release pairs with smp_load_acquire() from + * |next_slab_inited| above and in stack_depot_save(). + */ + smp_store_release(&next_slab_inited, 1); + } + return true; +} + +/* Allocation of a new stack in raw storage */ +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +{ + struct stack_record *stack; + size_t required_size = struct_size(stack, entries, size); + + required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + + if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return NULL; + } + depot_index++; + depot_offset = 0; + /* + * smp_store_release() here pairs with smp_load_acquire() from + * |next_slab_inited| in stack_depot_save() and + * init_stack_slab(). + */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) + smp_store_release(&next_slab_inited, 0); + } + init_stack_slab(prealloc); + if (stack_slabs[depot_index] == NULL) + return NULL; + + stack = stack_slabs[depot_index] + depot_offset; + + stack->hash = hash; + stack->size = size; + stack->handle.slabindex = depot_index; + stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.valid = 1; + stack->handle.extra = 0; + memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + depot_offset += required_size; + + return stack; +} + /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -309,85 +301,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * stack_depot_snprint - print stack entries from a depot into a buffer - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @buf: Pointer to the print buffer - * - * @size: Size of the print buffer - * - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed. - */ -int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, - int spaces) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(handle, &entries); - return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, - spaces) : 0; -} -EXPORT_SYMBOL_GPL(stack_depot_snprint); - -/** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). - * - */ -void stack_depot_print(depot_stack_handle_t stack) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(stack, &entries); - if (nr_entries > 0) - stack_trace_print(entries, nr_entries, 0); -} -EXPORT_SYMBOL_GPL(stack_depot_print); - -/** - * stack_depot_fetch - Fetch stack entries from a depot - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address - * - * Return: The number of trace entries for this depot. - */ -unsigned int stack_depot_fetch(depot_stack_handle_t handle, - unsigned long **entries) -{ - union handle_parts parts = { .handle = handle }; - void *slab; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; - struct stack_record *stack; - - *entries = NULL; - if (!handle) - return 0; - - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); - return 0; - } - slab = stack_slabs[parts.slabindex]; - if (!slab) - return 0; - stack = slab + offset; - - *entries = stack->entries; - return stack->size; -} -EXPORT_SYMBOL_GPL(stack_depot_fetch); - /** * __stack_depot_save - Save a stack trace from an array * @@ -533,3 +446,90 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); + +/** + * stack_depot_fetch - Fetch stack entries from a depot + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). + * @entries: Pointer to store the entries address + * + * Return: The number of trace entries for this depot. + */ +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries) +{ + union handle_parts parts = { .handle = handle }; + void *slab; + size_t offset = parts.offset << STACK_ALLOC_ALIGN; + struct stack_record *stack; + + *entries = NULL; + if (!handle) + return 0; + + if (parts.slabindex > depot_index) { + WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", + parts.slabindex, depot_index, handle); + return 0; + } + slab = stack_slabs[parts.slabindex]; + if (!slab) + return 0; + stack = slab + offset; + + *entries = stack->entries; + return stack->size; +} +EXPORT_SYMBOL_GPL(stack_depot_fetch); + +/** + * stack_depot_print - print stack entries from a depot + * + * @stack: Stack depot handle which was returned from + * stack_depot_save(). + * + */ +void stack_depot_print(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + if (nr_entries > 0) + stack_trace_print(entries, nr_entries, 0); +} +EXPORT_SYMBOL_GPL(stack_depot_print); + +/** + * stack_depot_snprint - print stack entries from a depot into a buffer + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). + * @buf: Pointer to the print buffer + * + * @size: Size of the print buffer + * + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. + */ +int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, + int spaces) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(handle, &entries); + return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, + spaces) : 0; +} +EXPORT_SYMBOL_GPL(stack_depot_snprint); + +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); -- cgit v1.2.3 From 4a6b5314d6bd9093dcc3c0c8e185af7df9a0fe34 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:50 +0100 Subject: lib/stackdepot: use pr_fmt to define message format Use pr_fmt to define the format for printing stack depot messages instead of duplicating the "Stack Depot" prefix in each message. Link: https://lkml.kernel.org/r/3d09db0171a0e92ff3eb0ee74de74558bc9b56c4.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4bfaf3bce619..83787e46a3ab 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -19,6 +19,8 @@ * Based on code by Dmitry Chernenkov. */ +#define pr_fmt(fmt) "stackdepot: " fmt + #include #include #include @@ -98,7 +100,7 @@ static int __init is_stack_depot_disabled(char *str) ret = kstrtobool(str, &stack_depot_disable); if (!ret && stack_depot_disable) { - pr_info("Stack Depot is disabled\n"); + pr_info("disabled\n"); stack_table = NULL; } return 0; @@ -142,7 +144,7 @@ int __init stack_depot_early_init(void) 1UL << STACK_HASH_ORDER_MAX); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; return -ENOMEM; } @@ -177,11 +179,11 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries with kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; ret = -ENOMEM; } -- cgit v1.2.3 From 1c0310add78e7e47e3357c24369b61453a5a72eb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:51 +0100 Subject: lib/stackdepot, mm: rename stack_depot_want_early_init Rename stack_depot_want_early_init to stack_depot_request_early_init. The old name is confusing, as it hints at returning some kind of intention of stack depot. The new name reflects that this function requests an action from stack depot instead. No functional changes. [akpm@linux-foundation.org: update mm/kmemleak.c] Link: https://lkml.kernel.org/r/359f31bf67429a06e630b4395816a967214ef753.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 +++++++------- lib/stackdepot.c | 10 +++++----- mm/kmemleak.c | 2 +- mm/page_owner.c | 2 +- mm/slub.c | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 1296a6eeaec0..c4e3abc16b16 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -31,26 +31,26 @@ typedef u32 depot_stack_handle_t; * enabled as part of mm_init(), for subsystems where it's known at compile time * that stack depot will be used. * - * Another alternative is to call stack_depot_want_early_init(), when the + * Another alternative is to call stack_depot_request_early_init(), when the * decision to use stack depot is taken e.g. when evaluating kernel boot * parameters, which precedes the enablement point in mm_init(). * - * stack_depot_init() and stack_depot_want_early_init() can be called regardless - * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print - * functions should only be called from code that makes sure CONFIG_STACKDEPOT - * is enabled. + * stack_depot_init() and stack_depot_request_early_init() can be called + * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual + * save/fetch/print functions should only be called from code that makes sure + * CONFIG_STACKDEPOT is enabled. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -void __init stack_depot_want_early_init(void); +void __init stack_depot_request_early_init(void); /* This is supposed to be called only from mm_init() */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } -static inline void stack_depot_want_early_init(void) { } +static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 83787e46a3ab..136706efe339 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,7 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; -static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; @@ -107,12 +107,12 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -void __init stack_depot_want_early_init(void) +void __init stack_depot_request_early_init(void) { - /* Too late to request early init now */ + /* Too late to request early init now. */ WARN_ON(__stack_depot_early_init_passed); - __stack_depot_want_early_init = true; + __stack_depot_early_init_requested = true; } int __init stack_depot_early_init(void) @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_want_early_init || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disable) return 0; if (stack_hash_order) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d9b242cfdb1c..a2d34226e3c8 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2071,7 +2071,7 @@ static int __init kmemleak_boot_config(char *str) kmemleak_disable(); else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; - stack_depot_want_early_init(); + stack_depot_request_early_init(); } else return -EINVAL; diff --git a/mm/page_owner.c b/mm/page_owner.c index 80dc8f4050fa..220cdeddc295 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -48,7 +48,7 @@ static int __init early_page_owner_param(char *buf) int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) - stack_depot_want_early_init(); + stack_depot_request_early_init(); return ret; } diff --git a/mm/slub.c b/mm/slub.c index 67020074ecb4..f8dba33e4d15 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1592,7 +1592,7 @@ static int __init setup_slub_debug(char *str) } else { slab_list_specified = true; if (flags & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); } } @@ -1611,7 +1611,7 @@ static int __init setup_slub_debug(char *str) out: slub_debug = global_flags; if (slub_debug & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else -- cgit v1.2.3 From 735df3c3a3493cbedfa86739eec6e2ee37fe95f8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:52 +0100 Subject: lib/stackdepot: rename stack_depot_disable Rename stack_depot_disable to stack_depot_disabled to make its name look similar to the names of other stack depot flags. Also put stack_depot_disabled's definition together with the other flags. Also rename is_stack_depot_disabled to disable_stack_depot: this name looks more conventional for a function that processes a boot parameter. No functional changes. Link: https://lkml.kernel.org/r/d78a07d222e689926e5ead229e4a2e3d87dc9aa7.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 136706efe339..202e07c4f02d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,6 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; +static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -91,21 +92,20 @@ static DEFINE_RAW_SPINLOCK(depot_lock); static unsigned int stack_hash_order; static unsigned int stack_hash_mask; -static bool stack_depot_disable; static struct stack_record **stack_table; -static int __init is_stack_depot_disabled(char *str) +static int __init disable_stack_depot(char *str) { int ret; - ret = kstrtobool(str, &stack_depot_disable); - if (!ret && stack_depot_disable) { + ret = kstrtobool(str, &stack_depot_disabled); + if (!ret && stack_depot_disabled) { pr_info("disabled\n"); stack_table = NULL; } return 0; } -early_param("stack_depot_disable", is_stack_depot_disabled); +early_param("stack_depot_disable", disable_stack_depot); void __init stack_depot_request_early_init(void) { @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_early_init_requested || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; if (stack_hash_order) @@ -145,7 +145,7 @@ int __init stack_depot_early_init(void) if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; return -ENOMEM; } @@ -158,7 +158,7 @@ int stack_depot_init(void) int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disable && !stack_table) { + if (!stack_depot_disabled && !stack_table) { unsigned long entries; int scale = STACK_HASH_SCALE; @@ -184,7 +184,7 @@ int stack_depot_init(void) stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; ret = -ENOMEM; } stack_hash_mask = entries - 1; @@ -353,7 +353,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, */ nr_entries = filter_irq_stacks(entries, nr_entries); - if (unlikely(nr_entries == 0) || stack_depot_disable) + if (unlikely(nr_entries == 0) || stack_depot_disabled) goto fast_exit; hash = hash_stack(entries, nr_entries); -- cgit v1.2.3 From df225c877d898992740d26250727a16db65c370d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:53 +0100 Subject: lib/stackdepot: annotate init and early init functions Add comments to stack_depot_early_init and stack_depot_init to explain certain parts of their implementation. Also add a pr_info message to stack_depot_early_init similar to the one in stack_depot_init. Also move the scale variable in stack_depot_init to the scope where it is being used. Link: https://lkml.kernel.org/r/d17fbfbd4d73f38686c5e3d4824a6d62047213a1.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 202e07c4f02d..9fab711e4826 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -115,24 +115,34 @@ void __init stack_depot_request_early_init(void) __stack_depot_early_init_requested = true; } +/* Allocates a hash table via memblock. Can only be used during early boot. */ int __init stack_depot_early_init(void) { unsigned long entries = 0; - /* This is supposed to be called only once, from mm_init() */ + /* This function must be called only once, from mm_init(). */ if (WARN_ON(__stack_depot_early_init_passed)) return 0; - __stack_depot_early_init_passed = true; + /* + * If KASAN is enabled, use the maximum order: KASAN is frequently used + * in fuzzing scenarios, which leads to a large number of different + * stack traces being stored in stack depot. + */ if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; + /* + * If stack_hash_order is not set, leave entries as 0 to rely on the + * automatic calculations performed by alloc_large_system_hash. + */ if (stack_hash_order) - entries = 1UL << stack_hash_order; + entries = 1UL << stack_hash_order; + pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, @@ -142,7 +152,6 @@ int __init stack_depot_early_init(void) &stack_hash_mask, 1UL << STACK_HASH_ORDER_MIN, 1UL << STACK_HASH_ORDER_MAX); - if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -152,6 +161,7 @@ int __init stack_depot_early_init(void) return 0; } +/* Allocates a hash table via kvcalloc. Can be used after boot. */ int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); @@ -160,11 +170,16 @@ int stack_depot_init(void) mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disabled && !stack_table) { unsigned long entries; - int scale = STACK_HASH_SCALE; + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ if (stack_hash_order) { entries = 1UL << stack_hash_order; } else { + int scale = STACK_HASH_SCALE; + entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -179,7 +194,7 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { -- cgit v1.2.3 From c60324fbf05d9b1dd3231f5373d4bf31cc23db07 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:54 +0100 Subject: lib/stackdepot: lower the indentation in stack_depot_init stack_depot_init does most things inside an if check. Move them out and use a goto statement instead. No functional changes. Link: https://lkml.kernel.org/r/8e382f1f0c352e4b2ad47326fec7782af961fe8e.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 70 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 9fab711e4826..3c713f70b0a3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -165,46 +165,50 @@ int __init stack_depot_early_init(void) int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + unsigned long entries; int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disabled && !stack_table) { - unsigned long entries; - /* - * Similarly to stack_depot_early_init, use stack_hash_order - * if assigned, and rely on automatic scaling otherwise. - */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; - } else { - int scale = STACK_HASH_SCALE; - - entries = nr_free_buffer_pages(); - entries = roundup_pow_of_two(entries); - - if (scale > PAGE_SHIFT) - entries >>= (scale - PAGE_SHIFT); - else - entries <<= (PAGE_SHIFT - scale); - } + if (stack_depot_disabled || stack_table) + goto out_unlock; - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; - - pr_info("allocating hash table of %lu entries via kvcalloc\n", - entries); - stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); - if (!stack_table) { - pr_err("hash table allocation failed, disabling\n"); - stack_depot_disabled = true; - ret = -ENOMEM; - } - stack_hash_mask = entries - 1; + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + int scale = STACK_HASH_SCALE; + + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { + pr_err("hash table allocation failed, disabling\n"); + stack_depot_disabled = true; + ret = -ENOMEM; + goto out_unlock; + } + stack_hash_mask = entries - 1; + +out_unlock: mutex_unlock(&stack_depot_init_mutex); + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); -- cgit v1.2.3 From 0d249ac0e07680960929a2d4f7b32be505c8c7a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:55 +0100 Subject: lib/stackdepot: reorder and annotate global variables Group stack depot global variables by their purpose: 1. Hash table-related variables, 2. Slab-related variables, and add comments. Also clean up comments for hash table-related constants. Link: https://lkml.kernel.org/r/5606a6c70659065a25bee59cd10e57fc60bb4110.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 3c713f70b0a3..de1afe3fb24d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -75,24 +75,31 @@ static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; - -static int depot_index; -static int next_slab_inited; -static size_t depot_offset; -static DEFINE_RAW_SPINLOCK(depot_lock); - -/* one hash table bucket entry per 16kB of memory */ +/* Use one hash table bucket per 16 KB of memory. */ #define STACK_HASH_SCALE 14 -/* limited between 4k and 1M buckets */ +/* Limit the number of buckets between 4K and 1M. */ #define STACK_HASH_ORDER_MIN 12 #define STACK_HASH_ORDER_MAX 20 +/* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c +/* Hash table of pointers to stored stack traces. */ +static struct stack_record **stack_table; +/* Fixed order of the number of table buckets. Used when KASAN is enabled. */ static unsigned int stack_hash_order; +/* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; -static struct stack_record **stack_table; +/* Array of memory regions that store stack traces. */ +static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; +/* Currently used slab in stack_slabs. */ +static int depot_index; +/* Offset to the unused space in the currently used slab. */ +static size_t depot_offset; +/* Lock that protects the variables above. */ +static DEFINE_RAW_SPINLOCK(depot_lock); +/* Whether the next slab is initialized. */ +static int next_slab_inited; static int __init disable_stack_depot(char *str) { -- cgit v1.2.3 From 4c2e9a679468a5bef2100504914481c6ddf0d9bd Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:56 +0100 Subject: lib/stackdepot: rename hash table constants and variables Give more meaningful names to hash table-related constants and variables: 1. Rename STACK_HASH_SCALE to STACK_HASH_TABLE_SCALE to point out that it is related to scaling the hash table. 2. Rename STACK_HASH_ORDER_MIN/MAX to STACK_BUCKET_NUMBER_ORDER_MIN/MAX to point out that it is related to the number of hash table buckets. 3. Rename stack_hash_order to stack_bucket_number_order for the same reason as #2. No functional changes. Link: https://lkml.kernel.org/r/f166dd6f3cb2378aea78600714393dd568c33ee9.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index de1afe3fb24d..d1ab53197353 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -76,17 +76,17 @@ static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_ST static bool __stack_depot_early_init_passed __initdata; /* Use one hash table bucket per 16 KB of memory. */ -#define STACK_HASH_SCALE 14 +#define STACK_HASH_TABLE_SCALE 14 /* Limit the number of buckets between 4K and 1M. */ -#define STACK_HASH_ORDER_MIN 12 -#define STACK_HASH_ORDER_MAX 20 +#define STACK_BUCKET_NUMBER_ORDER_MIN 12 +#define STACK_BUCKET_NUMBER_ORDER_MAX 20 /* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c /* Hash table of pointers to stored stack traces. */ static struct stack_record **stack_table; /* Fixed order of the number of table buckets. Used when KASAN is enabled. */ -static unsigned int stack_hash_order; +static unsigned int stack_bucket_number_order; /* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; @@ -137,28 +137,28 @@ int __init stack_depot_early_init(void) * in fuzzing scenarios, which leads to a large number of different * stack traces being stored in stack depot. */ - if (kasan_enabled() && !stack_hash_order) - stack_hash_order = STACK_HASH_ORDER_MAX; + if (kasan_enabled() && !stack_bucket_number_order) + stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; /* - * If stack_hash_order is not set, leave entries as 0 to rely on the - * automatic calculations performed by alloc_large_system_hash. + * If stack_bucket_number_order is not set, leave entries as 0 to rely + * on the automatic calculations performed by alloc_large_system_hash. */ - if (stack_hash_order) - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) + entries = 1UL << stack_bucket_number_order; pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, - STACK_HASH_SCALE, + STACK_HASH_TABLE_SCALE, HASH_EARLY | HASH_ZERO, NULL, &stack_hash_mask, - 1UL << STACK_HASH_ORDER_MIN, - 1UL << STACK_HASH_ORDER_MAX); + 1UL << STACK_BUCKET_NUMBER_ORDER_MIN, + 1UL << STACK_BUCKET_NUMBER_ORDER_MAX); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -181,13 +181,13 @@ int stack_depot_init(void) goto out_unlock; /* - * Similarly to stack_depot_early_init, use stack_hash_order + * Similarly to stack_depot_early_init, use stack_bucket_number_order * if assigned, and rely on automatic scaling otherwise. */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) { + entries = 1UL << stack_bucket_number_order; } else { - int scale = STACK_HASH_SCALE; + int scale = STACK_HASH_TABLE_SCALE; entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -198,10 +198,10 @@ int stack_depot_init(void) entries <<= (PAGE_SHIFT - scale); } - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; + if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN; + if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX; pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); -- cgit v1.2.3 From 961c949b012f009c51ce209ded801e34d0a75306 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:57 +0100 Subject: lib/stackdepot: rename slab to pool Use "pool" instead of "slab" for naming memory regions stack depot uses to store stack traces. Using "slab" is confusing, as stack depot pools have nothing to do with the slab allocator. Also give better names to pool-related global variables: change "depot_" prefix to "pool_" to point out that these variables are related to stack depot pools. Also rename the slabindex (poolindex) field in handle_parts to pool_index to align its name with the pool_index global variable. No functional changes. Link: https://lkml.kernel.org/r/923c507edb350c3b6ef85860f36be489dfc0ad21.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 106 +++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index d1ab53197353..522e36cf449f 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -39,7 +39,7 @@ #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) #define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ +#define STACK_ALLOC_ORDER 2 /* Pool size order for stack depot, 4 pages */ #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) #define STACK_ALLOC_ALIGN 4 #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ @@ -47,16 +47,16 @@ #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - \ STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_SLABS_CAP 8192 -#define STACK_ALLOC_MAX_SLABS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) +#define STACK_ALLOC_POOLS_CAP 8192 +#define STACK_ALLOC_MAX_POOLS \ + (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_POOLS_CAP) ? \ + (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_POOLS_CAP) /* The compact structure to store the reference to stacks. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 slabindex : STACK_ALLOC_INDEX_BITS; + u32 pool_index : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; @@ -91,15 +91,15 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; -/* Currently used slab in stack_slabs. */ -static int depot_index; -/* Offset to the unused space in the currently used slab. */ -static size_t depot_offset; +static void *stack_pools[STACK_ALLOC_MAX_POOLS]; +/* Currently used pool in stack_pools. */ +static int pool_index; +/* Offset to the unused space in the currently used pool. */ +static size_t pool_offset; /* Lock that protects the variables above. */ -static DEFINE_RAW_SPINLOCK(depot_lock); -/* Whether the next slab is initialized. */ -static int next_slab_inited; +static DEFINE_RAW_SPINLOCK(pool_lock); +/* Whether the next pool is initialized. */ +static int next_pool_inited; static int __init disable_stack_depot(char *str) { @@ -220,30 +220,30 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool init_stack_slab(void **prealloc) +static bool init_stack_pool(void **prealloc) { if (!*prealloc) return false; /* * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). + * |next_pool_inited| below and in depot_alloc_stack(). */ - if (smp_load_acquire(&next_slab_inited)) + if (smp_load_acquire(&next_pool_inited)) return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; + if (stack_pools[pool_index] == NULL) { + stack_pools[pool_index] = *prealloc; *prealloc = NULL; } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; + /* If this is the last depot pool, do not touch the next one. */ + if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) { + stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } /* * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). + * |next_pool_inited| above and in stack_depot_save(). */ - smp_store_release(&next_slab_inited, 1); + smp_store_release(&next_pool_inited, 1); } return true; } @@ -257,35 +257,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + if (unlikely(pool_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(pool_index + 1 >= STACK_ALLOC_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } - depot_index++; - depot_offset = 0; + pool_index++; + pool_offset = 0; /* * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). + * |next_pool_inited| in stack_depot_save() and + * init_stack_pool(). */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); + if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) + smp_store_release(&next_pool_inited, 0); } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) + init_stack_pool(prealloc); + if (stack_pools[pool_index] == NULL) return NULL; - stack = stack_slabs[depot_index] + depot_offset; + stack = stack_pools[pool_index] + pool_offset; stack->hash = hash; stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.pool_index = pool_index; + stack->handle.offset = pool_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; + pool_offset += required_size; return stack; } @@ -336,10 +336,10 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * @nr_entries: Size of the storage array * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags - * @can_alloc: Allocate stack slabs (increased chance of failure if false) + * @can_alloc: Allocate stack pools (increased chance of failure if false) * * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack slab pool in case no space is left + * %true, is allowed to replenish the stack pool in case no space is left * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids * any allocations and will fail if no space is left to store the stack trace. * @@ -396,14 +396,14 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto exit; /* - * Check if the current or the next stack slab need to be initialized. + * Check if the current or the next stack pool need to be initialized. * If so, allocate the memory - we won't be able to do that under the * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). + * |next_pool_inited| in depot_alloc_stack() and init_stack_pool(). */ - if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { + if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -417,7 +417,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, prealloc = page_address(page); } - raw_spin_lock_irqsave(&depot_lock, flags); + raw_spin_lock_irqsave(&pool_lock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { @@ -437,10 +437,10 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!init_stack_slab(&prealloc)); + WARN_ON(!init_stack_pool(&prealloc)); } - raw_spin_unlock_irqrestore(&depot_lock, flags); + raw_spin_unlock_irqrestore(&pool_lock, flags); exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ @@ -488,7 +488,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; - void *slab; + void *pool; size_t offset = parts.offset << STACK_ALLOC_ALIGN; struct stack_record *stack; @@ -496,15 +496,15 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); + if (parts.pool_index > pool_index) { + WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", + parts.pool_index, pool_index, handle); return 0; } - slab = stack_slabs[parts.slabindex]; - if (!slab) + pool = stack_pools[parts.pool_index]; + if (!pool) return 0; - stack = slab + offset; + stack = pool + offset; *entries = stack->entries; return stack->size; -- cgit v1.2.3 From 424cafee4a9c66435d8b86e7edbc794c116b52a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:58 +0100 Subject: lib/stackdepot: rename handle and pool constants Change the "STACK_ALLOC_" prefix to "DEPOT_" for the constants that define the number of bits in stack depot handles and the maximum number of pools. The old prefix is unclear and makes wonder about how these constants are related to stack allocations. The new prefix is also shorter. Also simplify the comment for DEPOT_POOL_ORDER. No functional changes. Link: https://lkml.kernel.org/r/84fcceb0acc261a356a0ad4bdfab9ff04bea2445.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 56 +++++++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 522e36cf449f..97bba462ee13 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,30 +36,28 @@ #include #include -#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) - -#define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* Pool size order for stack depot, 4 pages */ -#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) -#define STACK_ALLOC_ALIGN 4 -#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ - STACK_ALLOC_ALIGN) -#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - \ - STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_POOLS_CAP 8192 -#define STACK_ALLOC_MAX_POOLS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_POOLS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_POOLS_CAP) +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_VALID_BITS 1 +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \ + DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) +#define DEPOT_POOLS_CAP 8192 +#define DEPOT_MAX_POOLS \ + (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ + (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) /* The compact structure to store the reference to stacks. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 pool_index : STACK_ALLOC_INDEX_BITS; - u32 offset : STACK_ALLOC_OFFSET_BITS; - u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 valid : DEPOT_VALID_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -91,7 +89,7 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ -static void *stack_pools[STACK_ALLOC_MAX_POOLS]; +static void *stack_pools[DEPOT_MAX_POOLS]; /* Currently used pool in stack_pools. */ static int pool_index; /* Offset to the unused space in the currently used pool. */ @@ -235,7 +233,7 @@ static bool init_stack_pool(void **prealloc) *prealloc = NULL; } else { /* If this is the last depot pool, do not touch the next one. */ - if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) { + if (pool_index + 1 < DEPOT_MAX_POOLS) { stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } @@ -255,10 +253,10 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); - if (unlikely(pool_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(pool_index + 1 >= STACK_ALLOC_MAX_POOLS)) { + if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { + if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } @@ -269,7 +267,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) * |next_pool_inited| in stack_depot_save() and * init_stack_pool(). */ - if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) + if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } init_stack_pool(prealloc); @@ -281,7 +279,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->hash = hash; stack->size = size; stack->handle.pool_index = pool_index; - stack->handle.offset = pool_offset >> STACK_ALLOC_ALIGN; + stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); @@ -412,7 +410,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); alloc_flags |= __GFP_NOWARN; - page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); + page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER); if (page) prealloc = page_address(page); } @@ -444,7 +442,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ - free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); + free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) retval.handle = found->handle.handle; @@ -489,7 +487,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, { union handle_parts parts = { .handle = handle }; void *pool; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; + size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; *entries = NULL; -- cgit v1.2.3 From cb788e84a4cfe47941cded45b5ca81a917fbb1a6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:59 +0100 Subject: lib/stackdepot: rename init_stack_pool Rename init_stack_pool to depot_init_pool to align the name with depot_alloc_stack. No functional changes. Link: https://lkml.kernel.org/r/23106a3e291d8df0aba33c0e2fe86dc596286479.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 97bba462ee13..7f5f08bb6c3a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,7 +218,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool init_stack_pool(void **prealloc) +static bool depot_init_pool(void **prealloc) { if (!*prealloc) return false; @@ -265,12 +265,12 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * smp_store_release() here pairs with smp_load_acquire() from * |next_pool_inited| in stack_depot_save() and - * init_stack_pool(). + * depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } - init_stack_pool(prealloc); + depot_init_pool(prealloc); if (stack_pools[pool_index] == NULL) return NULL; @@ -399,7 +399,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_pool_inited| in depot_alloc_stack() and init_stack_pool(). + * |next_pool_inited| in depot_alloc_stack() and depot_init_pool(). */ if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { /* @@ -435,7 +435,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!init_stack_pool(&prealloc)); + WARN_ON(!depot_init_pool(&prealloc)); } raw_spin_unlock_irqrestore(&pool_lock, flags); -- cgit v1.2.3 From 514d5c557b8b590a80f0569af5ae5f4d455ecef2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:00 +0100 Subject: lib/stacktrace: drop impossible WARN_ON for depot_init_pool depot_init_pool has two call sites: 1. In depot_alloc_stack with a potentially NULL prealloc. 2. In __stack_depot_save with a non-NULL prealloc. At the same time depot_init_pool can only return false when prealloc is NULL. As the second call site makes sure that prealloc is not NULL, the WARN_ON there can never trigger. Thus, drop the WARN_ON and also move the prealloc check from depot_init_pool to its first call site. Also change the return type of depot_init_pool to void as it now always returns true. Link: https://lkml.kernel.org/r/ce149f9bdcbc80a92549b54da67eafb27f846b7b.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 7f5f08bb6c3a..d4d988276b91 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,16 +218,14 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool depot_init_pool(void **prealloc) +static void depot_init_pool(void **prealloc) { - if (!*prealloc) - return false; /* * This smp_load_acquire() pairs with smp_store_release() to * |next_pool_inited| below and in depot_alloc_stack(). */ if (smp_load_acquire(&next_pool_inited)) - return true; + return; if (stack_pools[pool_index] == NULL) { stack_pools[pool_index] = *prealloc; *prealloc = NULL; @@ -243,7 +241,6 @@ static bool depot_init_pool(void **prealloc) */ smp_store_release(&next_pool_inited, 1); } - return true; } /* Allocation of a new stack in raw storage */ @@ -270,7 +267,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } - depot_init_pool(prealloc); + if (*prealloc) + depot_init_pool(prealloc); if (stack_pools[pool_index] == NULL) return NULL; @@ -435,7 +433,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!depot_init_pool(&prealloc)); + depot_init_pool(&prealloc); } raw_spin_unlock_irqrestore(&pool_lock, flags); -- cgit v1.2.3 From cd0fc64e76844758b78d0fd376ae3ca4fd802049 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:01 +0100 Subject: lib/stackdepot: annotate depot_init_pool and depot_alloc_stack Clean up the exisiting comments and add new ones to depot_init_pool and depot_alloc_stack. As a part of the clean-up, remove mentions of which variable is accessed by smp_store_release and smp_load_acquire: it is clear as is from the code. Link: https://lkml.kernel.org/r/f80b02951364e6b40deda965b4003de0cd1a532d.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index d4d988276b91..c4bc198c3d93 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,32 +218,39 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); +/* Uses preallocated memory to initialize a new stack depot pool. */ static void depot_init_pool(void **prealloc) { /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_pool_inited| below and in depot_alloc_stack(). + * smp_load_acquire() here pairs with smp_store_release() below and + * in depot_alloc_stack(). */ if (smp_load_acquire(&next_pool_inited)) return; + + /* Check if the current pool is not yet allocated. */ if (stack_pools[pool_index] == NULL) { + /* Use the preallocated memory for the current pool. */ stack_pools[pool_index] = *prealloc; *prealloc = NULL; } else { - /* If this is the last depot pool, do not touch the next one. */ + /* + * Otherwise, use the preallocated memory for the next pool + * as long as we do not exceed the maximum number of pools. + */ if (pool_index + 1 < DEPOT_MAX_POOLS) { stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_pool_inited| above and in stack_depot_save(). + * This smp_store_release pairs with smp_load_acquire() above + * and in stack_depot_save(). */ smp_store_release(&next_pool_inited, 1); } } -/* Allocation of a new stack in raw storage */ +/* Allocates a new stack in a stack depot pool. */ static struct stack_record * depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { @@ -252,28 +259,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); + /* Check if there is not enough space in the current pool. */ if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { + /* Bail out if we reached the pool limit. */ if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } + + /* Move on to the next pool. */ pool_index++; pool_offset = 0; /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_pool_inited| in stack_depot_save() and - * depot_init_pool(). + * smp_store_release() here pairs with smp_load_acquire() in + * stack_depot_save() and depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } + + /* Assign the preallocated memory to a pool if required. */ if (*prealloc) depot_init_pool(prealloc); + + /* Check if we have a pool to save the stack trace. */ if (stack_pools[pool_index] == NULL) return NULL; + /* Save the stack trace. */ stack = stack_pools[pool_index] + pool_offset; - stack->hash = hash; stack->size = size; stack->handle.pool_index = pool_index; -- cgit v1.2.3 From d11a5621f3252120dfc7cef7600a90bd8e605caf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:02 +0100 Subject: lib/stackdepot: rename next_pool_inited to next_pool_required Stack depot uses next_pool_inited to mark that either the next pool is initialized or the limit on the number of pools is reached. However, the flag name only reflects the former part of its purpose, which is confusing. Rename next_pool_inited to next_pool_required and invert its value. Also annotate usages of next_pool_required with comments. Link: https://lkml.kernel.org/r/484fd2695dff7a9bdc437a32f8a6ee228535aa02.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c4bc198c3d93..4df162a84bfe 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -96,8 +96,14 @@ static int pool_index; static size_t pool_offset; /* Lock that protects the variables above. */ static DEFINE_RAW_SPINLOCK(pool_lock); -/* Whether the next pool is initialized. */ -static int next_pool_inited; +/* + * Stack depot tries to keep an extra pool allocated even before it runs out + * of space in the currently used pool. + * This flag marks that this next extra pool needs to be allocated and + * initialized. It has the value 0 when either the next pool is not yet + * initialized or the limit on the number of pools is reached. + */ +static int next_pool_required = 1; static int __init disable_stack_depot(char *str) { @@ -222,10 +228,12 @@ EXPORT_SYMBOL_GPL(stack_depot_init); static void depot_init_pool(void **prealloc) { /* + * If the next pool is already initialized or the maximum number of + * pools is reached, do not use the preallocated memory. * smp_load_acquire() here pairs with smp_store_release() below and * in depot_alloc_stack(). */ - if (smp_load_acquire(&next_pool_inited)) + if (!smp_load_acquire(&next_pool_required)) return; /* Check if the current pool is not yet allocated. */ @@ -243,10 +251,13 @@ static void depot_init_pool(void **prealloc) *prealloc = NULL; } /* + * At this point, either the next pool is initialized or the + * maximum number of pools is reached. In either case, take + * note that initializing another pool is not required. * This smp_store_release pairs with smp_load_acquire() above * and in stack_depot_save(). */ - smp_store_release(&next_pool_inited, 1); + smp_store_release(&next_pool_required, 0); } } @@ -271,11 +282,13 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) pool_index++; pool_offset = 0; /* + * If the maximum number of pools is not reached, take note + * that the next pool needs to initialized. * smp_store_release() here pairs with smp_load_acquire() in * stack_depot_save() and depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) - smp_store_release(&next_pool_inited, 0); + smp_store_release(&next_pool_required, 1); } /* Assign the preallocated memory to a pool if required. */ @@ -406,14 +419,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto exit; /* - * Check if the current or the next stack pool need to be initialized. - * If so, allocate the memory - we won't be able to do that under the - * lock. + * Check if another stack pool needs to be initialized. If so, allocate + * the memory now - we won't be able to do that under the lock. * * The smp_load_acquire() here pairs with smp_store_release() to * |next_pool_inited| in depot_alloc_stack() and depot_init_pool(). */ - if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { + if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic -- cgit v1.2.3 From 36aa1e6779c3c6f8e0d4552544214f5cffe3c287 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:03 +0100 Subject: lib/stacktrace, kasan, kmsan: rework extra_bits interface The current implementation of the extra_bits interface is confusing: passing extra_bits to __stack_depot_save makes it seem that the extra bits are somehow stored in stack depot. In reality, they are only embedded into a stack depot handle and are not used within stack depot. Drop the extra_bits argument from __stack_depot_save and instead provide a new stack_depot_set_extra_bits function (similar to the exsiting stack_depot_get_extra_bits) that saves extra bits into a stack depot handle. Update the callers of __stack_depot_save to use the new interace. This change also fixes a minor issue in the old code: __stack_depot_save does not return NULL if saving stack trace fails and extra_bits is used. Link: https://lkml.kernel.org/r/317123b5c05e2f82854fc55d8b285e0869d3cb77.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 4 +++- lib/stackdepot.c | 42 +++++++++++++++++++++++++++++++++--------- mm/kasan/common.c | 2 +- mm/kmsan/core.c | 10 +++++++--- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4e3abc16b16..267f4b2634ee 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -57,7 +57,6 @@ static inline int stack_depot_early_init(void) { return 0; } depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); depot_stack_handle_t stack_depot_save(unsigned long *entries, @@ -71,6 +70,9 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +depot_stack_handle_t __must_check stack_depot_set_extra_bits( + depot_stack_handle_t handle, unsigned int extra_bits); + unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4df162a84bfe..8c6e4e9cb535 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -357,7 +357,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * * @entries: Pointer to storage array * @nr_entries: Size of the storage array - * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack pools (increased chance of failure if false) * @@ -369,10 +368,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * - * Additional opaque flags can be passed in @extra_bits, stored in the unused - * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() - * without calling stack_depot_fetch(). - * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -382,7 +377,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; @@ -471,8 +465,6 @@ exit: if (found) retval.handle = found->handle.handle; fast_exit: - retval.extra = extra_bits; - return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -493,7 +485,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); @@ -576,6 +568,38 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle returned from stack_depot_save() + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ +depot_stack_handle_t __must_check stack_depot_set_extra_bits( + depot_stack_handle_t handle, unsigned int extra_bits) +{ + union handle_parts parts = { .handle = handle }; + + /* Don't set extra bits on empty handles. */ + if (!handle) + return 0; + + parts.extra = extra_bits; + return parts.handle; +} +EXPORT_SYMBOL(stack_depot_set_extra_bits); + +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1e7336ae3786..b376a5d055e5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 112dce135c7f..f710257d6867 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -69,13 +69,15 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, { unsigned long entries[KMSAN_STACK_DEPTH]; unsigned int nr_entries; + depot_stack_handle_t handle; nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ flags &= ~__GFP_DIRECT_RECLAIM; - return __stack_depot_save(entries, nr_entries, extra, flags, true); + handle = __stack_depot_save(entries, nr_entries, flags, true); + return stack_depot_set_extra_bits(handle, extra); } /* Copy the metadata following the memmove() behavior. */ @@ -215,6 +217,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) u32 extra_bits; int depth; bool uaf; + depot_stack_handle_t handle; if (!id) return id; @@ -250,8 +253,9 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * positives when __stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, - GFP_ATOMIC, true); + handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC, + true); + return stack_depot_set_extra_bits(handle, extra_bits); } void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, -- cgit v1.2.3 From beb3c23c69a91a10f247e93ffef1fcd0209d93e4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:04 +0100 Subject: lib/stackdepot: annotate racy pool_index accesses Accesses to pool_index are protected by pool_lock everywhere except in a sanity check in stack_depot_fetch. The read access there can race with the write access in depot_alloc_stack. Use WRITE/READ_ONCE() to annotate the racy accesses. As the sanity check is only used to print a warning in case of a violation of the stack depot interface usage, it does not make a lot of sense to use proper synchronization. [andreyknvl@google.com: s/pool_index/pool_index_cached/ in stack_depot_fetch()] Link: https://lkml.kernel.org/r/95cf53f0da2c112aa2cc54456cbcd6975c3ff343.1676129911.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/359ac9c13cd0869c56740fb2029f505e41593830.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8c6e4e9cb535..b625090890e1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -278,8 +278,12 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return NULL; } - /* Move on to the next pool. */ - pool_index++; + /* + * Move on to the next pool. + * WRITE_ONCE pairs with potential concurrent read in + * stack_depot_fetch(). + */ + WRITE_ONCE(pool_index, pool_index + 1); pool_offset = 0; /* * If the maximum number of pools is not reached, take note @@ -502,6 +506,11 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; + /* + * READ_ONCE pairs with potential concurrent write in + * depot_alloc_stack. + */ + int pool_index_cached = READ_ONCE(pool_index); void *pool; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; @@ -510,9 +519,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.pool_index > pool_index) { + if (parts.pool_index > pool_index_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pool_index, handle); + parts.pool_index, pool_index_cached, handle); return 0; } pool = stack_pools[parts.pool_index]; -- cgit v1.2.3 From b232b9995a6dbaafe19d07d81acc039bc84bd569 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:05 +0100 Subject: lib/stackdepot: various comments clean-ups Clean up comments in include/linux/stackdepot.h and lib/stackdepot.c: 1. Rework the initialization comment in stackdepot.h. 2. Rework the header comment in stackdepot.c. 3. Various clean-ups for other comments. Also adjust whitespaces for find_stack and depot_alloc_stack call sites. No functional changes. Link: https://lkml.kernel.org/r/5836231b7954355e2311fc9b5870f697ea8e1f7d.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++++------- lib/stackdepot.c | 120 ++++++++++++++++++++++----------------------- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 267f4b2634ee..afdf8ee7b597 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * A generic stack depot implementation + * Stack depot - a stack trace storage that avoids duplication. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #ifndef _LINUX_STACKDEPOT_H @@ -17,35 +17,37 @@ typedef u32 depot_stack_handle_t; /* * Number of bits in the handle that stack depot doesn't use. Users may store - * information in them. + * information in them via stack_depot_set/get_extra_bits. */ #define STACK_DEPOT_EXTRA_BITS 5 /* - * Every user of stack depot has to call stack_depot_init() during its own init - * when it's decided that it will be calling stack_depot_save() later. This is - * recommended for e.g. modules initialized later in the boot process, when - * slab_is_available() is true. + * Using stack depot requires its initialization, which can be done in 3 ways: * - * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot - * enabled as part of mm_init(), for subsystems where it's known at compile time - * that stack depot will be used. + * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in + * scenarios where it's known at compile time that stack depot will be used. + * Enabling this config makes the kernel initialize stack depot in mm_init(). * - * Another alternative is to call stack_depot_request_early_init(), when the - * decision to use stack depot is taken e.g. when evaluating kernel boot - * parameters, which precedes the enablement point in mm_init(). + * 2. Calling stack_depot_request_early_init() during early boot, before + * stack_depot_early_init() in mm_init() completes. For example, this can + * be done when evaluating kernel boot parameters. + * + * 3. Calling stack_depot_init(). Possible after boot is complete. This option + * is recommended for modules initialized later in the boot process, after + * mm_init() completes. * * stack_depot_init() and stack_depot_request_early_init() can be called - * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual - * save/fetch/print functions should only be called from code that makes sure - * CONFIG_STACKDEPOT is enabled. + * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this + * config is disabled. The save/fetch/print stack depot functions can only be + * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_ + * initializes stack depot via one of the ways listed above. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); void __init stack_depot_request_early_init(void); -/* This is supposed to be called only from mm_init() */ +/* Must be only called from mm_init(). */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b625090890e1..aaa5d2f1abc2 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -1,22 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Generic stack depot for storing stack traces. + * Stack depot - a stack trace storage that avoids duplication. * - * Some debugging tools need to save stack traces of certain events which can - * be later presented to the user. For example, KASAN needs to safe alloc and - * free stacks for each object, but storing two stack traces per object - * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for - * that). + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. * - * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc - * and free stacks repeat a lot, we save about 100x space. - * Stacks are never removed from depot, so we store them contiguously one after - * another in a contiguous memory allocation. + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Internally, stack depot maintains a hash table of unique stacktraces. The + * stack traces themselves are stored contiguously one after another in a set + * of separate page allocations. + * + * Stack traces are never removed from stack depot. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #define pr_fmt(fmt) "stackdepot: " fmt @@ -50,7 +54,7 @@ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) -/* The compact structure to store the reference to stacks. */ +/* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { @@ -62,11 +66,11 @@ union handle_parts { }; struct stack_record { - struct stack_record *next; /* Link in the hashtable */ - u32 hash; /* Hash in the hastable */ - u32 size; /* Number of frames in the stack */ + struct stack_record *next; /* Link in the hash table */ + u32 hash; /* Hash in the hash table */ + u32 size; /* Number of stored frames */ union handle_parts handle; - unsigned long entries[]; /* Variable-sized array of entries. */ + unsigned long entries[]; /* Variable-sized array of frames */ }; static bool stack_depot_disabled; @@ -317,7 +321,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } -/* Calculate hash for a stack */ +/* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { return jhash2((u32 *)entries, @@ -325,9 +329,9 @@ static inline u32 hash_stack(unsigned long *entries, unsigned int size) STACK_HASH_SEED); } -/* Use our own, non-instrumented version of memcmp(). - * - * We actually don't care about the order, just the equality. +/* + * Non-instrumented version of memcmp(). + * Does not check the lexicographical order, only the equality. */ static inline int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, @@ -340,7 +344,7 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, return 0; } -/* Find a stack that is equal to the one stored in entries in the hash */ +/* Finds a stack in a bucket of the hash table. */ static inline struct stack_record *find_stack(struct stack_record *bucket, unsigned long *entries, int size, u32 hash) @@ -357,27 +361,27 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, } /** - * __stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * @can_alloc: Allocate stack pools (increased chance of failure if false) * * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack pool in case no space is left + * %true, stack depot can replenish the stack pools in case no space is left * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and will fail if no space is left to store the stack trace. + * any allocations and fails if no space is left to store the stack trace. * - * If the stack trace in @entries is from an interrupt, only the portion up to - * interrupt entry is saved. + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently - * this is the case from contexts where neither %GFP_ATOMIC nor + * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack struct stored in depot, 0 on failure */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -392,11 +396,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, /* * If this stack trace is from an interrupt, including anything before - * interrupt entry usually leads to unbounded stackdepot growth. + * interrupt entry usually leads to unbounded stack depot growth. * - * Because use of filter_irq_stacks() is a requirement to ensure - * stackdepot can efficiently deduplicate interrupt stacks, always - * filter_irq_stacks() to simplify all callers' use of stackdepot. + * Since use of filter_irq_stacks() is a requirement to ensure stack + * depot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stack depot. */ nr_entries = filter_irq_stacks(entries, nr_entries); @@ -411,8 +415,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |bucket| below. */ - found = find_stack(smp_load_acquire(bucket), entries, - nr_entries, hash); + found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); if (found) goto exit; @@ -441,7 +444,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + struct stack_record *new = + depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { new->next = *bucket; @@ -454,8 +458,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } } else if (prealloc) { /* - * We didn't need to store this stack trace, but let's keep - * the preallocated memory for the future. + * Stack depot already contains this stack trace, but let's + * keep the preallocated memory for the future. */ depot_init_pool(&prealloc); } @@ -463,7 +467,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, raw_spin_unlock_irqrestore(&pool_lock, flags); exit: if (prealloc) { - /* Nobody used this memory, ok to free it. */ + /* Stack depot didn't use this memory, free it. */ free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) @@ -474,16 +478,16 @@ fast_exit: EXPORT_SYMBOL_GPL(__stack_depot_save); /** - * stack_depot_save - Save a stack trace from an array + * stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. * See __stack_depot_save() for more details. * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack trace stored in depot, 0 on failure */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -494,13 +498,12 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, EXPORT_SYMBOL_GPL(stack_depot_save); /** - * stack_depot_fetch - Fetch stack entries from a depot + * stack_depot_fetch - Fetch a stack trace from stack depot * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace * - * Return: The number of trace entries for this depot. + * Return: Number of frames for the fetched stack */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) @@ -535,11 +538,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). + * stack_depot_print - Print a stack trace from stack depot * + * @stack: Stack depot handle returned from stack_depot_save() */ void stack_depot_print(depot_stack_handle_t stack) { @@ -553,17 +554,14 @@ void stack_depot_print(depot_stack_handle_t stack) EXPORT_SYMBOL_GPL(stack_depot_print); /** - * stack_depot_snprint - print stack entries from a depot into a buffer + * stack_depot_snprint - Print a stack trace from stack depot into a buffer * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). + * @handle: Stack depot handle returned from stack_depot_save() * @buf: Pointer to the print buffer - * * @size: Size of the print buffer - * * @spaces: Number of leading spaces to print * - * Return: Number of bytes printed. + * Return: Number of bytes printed */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) -- cgit v1.2.3 From 0621d160f1003a8aedd3628133568ecffdd724f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:06 +0100 Subject: lib/stackdepot: move documentation comments to stackdepot.h Move all interface- and usage-related documentation comments to include/linux/stackdepot.h. It makes sense to have them in the header where they are available to the interface users. [akpm@linux-foundation.org: grammar fix, per Alexander] Link: https://lkml.kernel.org/r/fbfee41495b306dd8881f9b1c1b80999c885e82f.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 87 ++++++++++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 87 ---------------------------------------------- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index afdf8ee7b597..e58306783d8e 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -2,6 +2,17 @@ /* * Stack depot - a stack trace storage that avoids duplication. * + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. + * + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Stack traces are never removed from the stack depot. + * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * @@ -57,24 +68,100 @@ static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +/** + * __stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * @can_alloc: Allocate stack pools (increased chance of failure if false) + * + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, stack depot can replenish the stack pools in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and fails if no space is left to store the stack trace. + * + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case for contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: Handle of the stack struct stored in depot, 0 on failure + */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +/** + * stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: Handle of the stack trace stored in depot, 0 on failure + */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * stack_depot_fetch - Fetch a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace + * + * Return: Number of frames for the fetched stack + */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +/** + * stack_depot_print - Print a stack trace from stack depot + * + * @stack: Stack depot handle returned from stack_depot_save() + */ void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_snprint - Print a stack trace from stack depot into a buffer + * + * @handle: Stack depot handle returned from stack_depot_save() + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed + */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle returned from stack_depot_save() + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ depot_stack_handle_t __must_check stack_depot_set_extra_bits( depot_stack_handle_t handle, unsigned int extra_bits); +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index aaa5d2f1abc2..036da8e295d1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -2,21 +2,10 @@ /* * Stack depot - a stack trace storage that avoids duplication. * - * Stack depot is intended to be used by subsystems that need to store and - * later retrieve many potentially duplicated stack traces without wasting - * memory. - * - * For example, KASAN needs to save allocation and free stack traces for each - * object. Storing two stack traces per object requires a lot of memory (e.g. - * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free - * stack traces often repeat, using stack depot allows to save about 100x space. - * * Internally, stack depot maintains a hash table of unique stacktraces. The * stack traces themselves are stored contiguously one after another in a set * of separate page allocations. * - * Stack traces are never removed from stack depot. - * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * @@ -360,29 +349,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * __stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) - * - * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, stack depot can replenish the stack pools in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. - * - * If the provided stack trace comes from the interrupt context, only the part - * up to the interrupt entry is saved. - * - * Context: Any context, but setting @can_alloc to %false is required if - * alloc_pages() cannot be used from the current context. Currently - * this is the case for contexts where neither %GFP_ATOMIC nor - * %GFP_NOWAIT can be used (NMI, raw_spin_lock). - * - * Return: Handle of the stack struct stored in depot, 0 on failure - */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags, bool can_alloc) @@ -477,18 +443,6 @@ fast_exit: } EXPORT_SYMBOL_GPL(__stack_depot_save); -/** - * stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. - * - * Return: Handle of the stack trace stored in depot, 0 on failure - */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) @@ -497,14 +451,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); -/** - * stack_depot_fetch - Fetch a stack trace from stack depot - * - * @handle: Stack depot handle returned from stack_depot_save() - * @entries: Pointer to store the address of the stack trace - * - * Return: Number of frames for the fetched stack - */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { @@ -537,11 +483,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, } EXPORT_SYMBOL_GPL(stack_depot_fetch); -/** - * stack_depot_print - Print a stack trace from stack depot - * - * @stack: Stack depot handle returned from stack_depot_save() - */ void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; @@ -553,16 +494,6 @@ void stack_depot_print(depot_stack_handle_t stack) } EXPORT_SYMBOL_GPL(stack_depot_print); -/** - * stack_depot_snprint - Print a stack trace from stack depot into a buffer - * - * @handle: Stack depot handle returned from stack_depot_save() - * @buf: Pointer to the print buffer - * @size: Size of the print buffer - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed - */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) { @@ -575,17 +506,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); -/** - * stack_depot_set_extra_bits - Set extra bits in a stack depot handle - * - * @handle: Stack depot handle returned from stack_depot_save() - * @extra_bits: Value to set the extra bits - * - * Return: Stack depot handle with extra bits set - * - * Stack depot handles have a few unused bits, which can be used for storing - * user-specific information. These bits are transparent to the stack depot. - */ depot_stack_handle_t __must_check stack_depot_set_extra_bits( depot_stack_handle_t handle, unsigned int extra_bits) { @@ -600,13 +520,6 @@ depot_stack_handle_t __must_check stack_depot_set_extra_bits( } EXPORT_SYMBOL(stack_depot_set_extra_bits); -/** - * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle - * - * @handle: Stack depot handle with extra bits saved - * - * Return: Extra bits retrieved from the stack depot handle - */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; -- cgit v1.2.3 From 5b855937096aea7f81e73ad6d40d433c9dd49577 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:36 +0800 Subject: migrate_pages: organize stats with struct migrate_pages_stats Patch series "migrate_pages(): batch TLB flushing", v5. Now, migrate_pages() migrates folios one by one, like the fake code as follows, for each folio unmap flush TLB copy restore map If multiple folios are passed to migrate_pages(), there are opportunities to batch the TLB flushing and copying. That is, we can change the code to something as follows, for each folio unmap for each folio flush TLB for each folio copy for each folio restore map The total number of TLB flushing IPI can be reduced considerably. And we may use some hardware accelerator such as DSA to accelerate the folio copying. So in this patch, we refactor the migrate_pages() implementation and implement the TLB flushing batching. Base on this, hardware accelerated folio copying can be implemented. If too many folios are passed to migrate_pages(), in the naive batched implementation, we may unmap too many folios at the same time. The possibility for a task to wait for the migrated folios to be mapped again increases. So the latency may be hurt. To deal with this issue, the max number of folios be unmapped in batch is restricted to no more than HPAGE_PMD_NR in the unit of page. That is, the influence is at the same level of THP migration. We use the following test to measure the performance impact of the patchset, On a 2-socket Intel server, - Run pmbench memory accessing benchmark - Run `migratepages` to migrate pages of pmbench between node 0 and node 1 back and forth. With the patch, the TLB flushing IPI reduces 99.1% during the test and the number of pages migrated successfully per second increases 291.7%. Xin Hao helped to test the patchset on an ARM64 server with 128 cores, 2 NUMA nodes. Test results show that the page migration performance increases up to 78%. This patch (of 9): Define struct migrate_pages_stats to organize the various statistics in migrate_pages(). This makes it easier to collect and consume the statistics in multiple functions. This will be needed in the following patches in the series. Link: https://lkml.kernel.org/r/20230213123444.155149-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230213123444.155149-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Alistair Popple Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 60 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 5b40b9040ba6..1a9cfcf857d2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,16 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +struct migrate_pages_stats { + int nr_succeeded; /* Normal and large folios migrated successfully, in + units of base pages */ + int nr_failed_pages; /* Normal and large folios failed to be migrated, in + units of base pages. Untried folios aren't counted */ + int nr_thp_succeeded; /* THP migrated successfully */ + int nr_thp_failed; /* THP failed to be migrated */ + int nr_thp_split; /* THP split before migrating */ +}; + /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration @@ -1448,13 +1458,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int large_retry = 1; int thp_retry = 1; int nr_failed = 0; - int nr_failed_pages = 0; int nr_retry_pages = 0; - int nr_succeeded = 0; - int nr_thp_succeeded = 0; int nr_large_failed = 0; - int nr_thp_failed = 0; - int nr_thp_split = 0; int pass = 0; bool is_large = false; bool is_thp = false; @@ -1464,9 +1469,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); bool no_split_folio_counting = false; + struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); + memset(&stats, 0, sizeof(stats)); split_folio_migration: for (pass = 0; pass < 10 && (retry || large_retry); pass++) { retry = 0; @@ -1520,9 +1527,9 @@ split_folio_migration: /* Large folio migration is unsupported */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; if (!try_split_folio(folio, &split_folios)) { - nr_thp_split += is_thp; + stats.nr_thp_split += is_thp; break; } /* Hugetlb migration is unsupported */ @@ -1530,7 +1537,7 @@ split_folio_migration: nr_failed++; } - nr_failed_pages += nr_pages; + stats.nr_failed_pages += nr_pages; list_move_tail(&folio->lru, &ret_folios); break; case -ENOMEM: @@ -1540,13 +1547,13 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { int ret = try_split_folio(folio, &split_folios); if (!ret) { - nr_thp_split += is_thp; + stats.nr_thp_split += is_thp; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1564,7 +1571,7 @@ split_folio_migration: nr_failed++; } - nr_failed_pages += nr_pages + nr_retry_pages; + stats.nr_failed_pages += nr_pages + nr_retry_pages; /* * There might be some split folios of fail-to-migrate large * folios left in split_folios list. Move them back to migration @@ -1574,7 +1581,7 @@ split_folio_migration: list_splice_init(&split_folios, from); /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; - nr_thp_failed += thp_retry; + stats.nr_thp_failed += thp_retry; goto out; case -EAGAIN: if (is_large) { @@ -1586,8 +1593,8 @@ split_folio_migration: nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded += nr_pages; - nr_thp_succeeded += is_thp; + stats.nr_succeeded += nr_pages; + stats.nr_thp_succeeded += is_thp; break; default: /* @@ -1598,20 +1605,20 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_pages; + stats.nr_failed_pages += nr_pages; break; } } } nr_failed += retry; nr_large_failed += large_retry; - nr_thp_failed += thp_retry; - nr_failed_pages += nr_retry_pages; + stats.nr_thp_failed += thp_retry; + stats.nr_failed_pages += nr_retry_pages; /* * Try to migrate split folios of fail-to-migrate large folios, no * nr_failed counting in this round, since all split folios of a @@ -1644,16 +1651,17 @@ out: if (list_empty(from)) rc = 0; - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); - count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); - count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); - count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, - nr_thp_failed, nr_thp_split, mode, reason); + count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); + count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); + count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); + count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); + count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); + trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, + stats.nr_thp_succeeded, stats.nr_thp_failed, + stats.nr_thp_split, mode, reason); if (ret_succeeded) - *ret_succeeded = nr_succeeded; + *ret_succeeded = stats.nr_succeeded; return rc; } -- cgit v1.2.3 From e5bfff8b10e496378da4b7863479dd6fb907d4ea Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:37 +0800 Subject: migrate_pages: separate hugetlb folios migration This is a preparation patch to batch the folio unmapping and moving for the non-hugetlb folios. Based on that we can batch the TLB shootdown during the folio migration and make it possible to use some hardware accelerator for the folio copying. In this patch the hugetlb folios and non-hugetlb folios migration is separated in migrate_pages() to make it easy to change the non-hugetlb folios migration implementation. Link: https://lkml.kernel.org/r/20230213123444.155149-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 22 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1a9cfcf857d2..586a32bdaa71 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,8 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +#define NR_MAX_MIGRATE_PAGES_RETRY 10 + struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in units of base pages */ @@ -1424,6 +1426,95 @@ struct migrate_pages_stats { int nr_thp_split; /* THP split before migrating */ }; +/* + * Returns the number of hugetlb folios that were not migrated, or an error code + * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable + * any more because the list has become empty or no retryable hugetlb folios + * exist any more. It is caller's responsibility to call putback_movable_pages() + * only if ret != 0. + */ +static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, + struct migrate_pages_stats *stats, + struct list_head *ret_folios) +{ + int retry = 1; + int nr_failed = 0; + int nr_retry_pages = 0; + int pass = 0; + struct folio *folio, *folio2; + int rc, nr_pages; + + for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { + retry = 0; + nr_retry_pages = 0; + + list_for_each_entry_safe(folio, folio2, from, lru) { + if (!folio_test_hugetlb(folio)) + continue; + + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + put_new_page, private, + &folio->page, pass > 2, mode, + reason, ret_folios); + /* + * The rules are: + * Success: hugetlb folio will be put back + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list + * Other errno: put on ret_folios list + */ + switch(rc) { + case -ENOSYS: + /* Hugetlb migration is unsupported */ + nr_failed++; + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + break; + case -ENOMEM: + /* + * When memory is low, don't bother to try to migrate + * other folios, just exit. + */ + stats->nr_failed_pages += nr_pages + nr_retry_pages; + return -ENOMEM; + case -EAGAIN: + retry++; + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + break; + default: + /* + * Permanent failure (-EBUSY, etc.): + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not + * retried in the next outer loop. + */ + nr_failed++; + stats->nr_failed_pages += nr_pages; + break; + } + } + } + /* + * nr_failed is number of hugetlb folios failed to be migrated. After + * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb + * folios as failed. + */ + nr_failed += retry; + stats->nr_failed_pages += nr_retry_pages; + + return nr_failed; +} + /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration @@ -1440,10 +1531,10 @@ struct migrate_pages_stats { * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * - * The function returns after 10 attempts or if no folios are movable any more - * because the list has become empty or no retryable folios exist any more. - * It is caller's responsibility to call putback_movable_pages() to return folios - * to the LRU or free list only if ret != 0. + * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios + * are movable any more because the list has become empty or no retryable folios + * exist any more. It is caller's responsibility to call putback_movable_pages() + * only if ret != 0. * * Returns the number of {normal folio, large folio, hugetlb} that were not * migrated, or an error code. The number of large folio splits will be @@ -1457,7 +1548,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int large_retry = 1; int thp_retry = 1; - int nr_failed = 0; + int nr_failed; int nr_retry_pages = 0; int nr_large_failed = 0; int pass = 0; @@ -1474,38 +1565,45 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); + rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, + &stats, &ret_folios); + if (rc < 0) + goto out; + nr_failed = rc; + split_folio_migration: - for (pass = 0; pass < 10 && (retry || large_retry); pass++) { + for (pass = 0; + pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); + pass++) { retry = 0; large_retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { + /* Retried hugetlb folios will be kept in list */ + if (folio_test_hugetlb(folio)) { + list_move_tail(&folio->lru, &ret_folios); + continue; + } + /* * Large folio statistics is based on the source large * folio. Capture required information that might get * lost during migration. */ - is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); + is_large = folio_test_large(folio); is_thp = is_large && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); + cond_resched(); - if (folio_test_hugetlb(folio)) - rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, - &folio->page, pass > 2, mode, - reason, - &ret_folios); - else - rc = unmap_and_move(get_new_page, put_new_page, - private, folio, pass > 2, mode, - reason, &ret_folios); + rc = unmap_and_move(get_new_page, put_new_page, + private, folio, pass > 2, mode, + reason, &ret_folios); /* * The rules are: - * Success: non hugetlb folio will be freed, hugetlb - * folio will be put back + * Success: folio will be freed * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list @@ -1532,7 +1630,6 @@ split_folio_migration: stats.nr_thp_split += is_thp; break; } - /* Hugetlb migration is unsupported */ } else if (!no_split_folio_counting) { nr_failed++; } @@ -1626,8 +1723,8 @@ split_folio_migration: */ if (!list_empty(&split_folios)) { /* - * Move non-migrated folios (after 10 retries) to ret_folios - * to avoid migrating them again. + * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY + * retries) to ret_folios to avoid migrating them again. */ list_splice_init(from, &ret_folios); list_splice_init(&split_folios, from); -- cgit v1.2.3 From 42012e0436d44aeb2e68f11a28ddd0ad3f38b61f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:38 +0800 Subject: migrate_pages: restrict number of pages to migrate in batch This is a preparation patch to batch the folio unmapping and moving for non-hugetlb folios. If we had batched the folio unmapping, all folios to be migrated would be unmapped before copying the contents and flags of the folios. If the folios that were passed to migrate_pages() were too many in unit of pages, the execution of the processes would be stopped for too long time, thus too long latency. For example, migrate_pages() syscall will call migrate_pages() with all folios of a process. To avoid this possible issue, in this patch, we restrict the number of pages to be migrated to be no more than HPAGE_PMD_NR. That is, the influence is at the same level of THP migration. Link: https://lkml.kernel.org/r/20230213123444.155149-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 174 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 68 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 586a32bdaa71..d436f35fa145 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,11 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR +#else +#define NR_MAX_BATCHED_MIGRATION 512 +#endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 struct migrate_pages_stats { @@ -1515,40 +1520,15 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, return nr_failed; } -/* - * migrate_pages - migrate the folios specified in a list, to the free folios - * supplied as the target for the page migration - * - * @from: The list of folios to be migrated. - * @get_new_page: The function used to allocate free folios to be used - * as the target of the folio migration. - * @put_new_page: The function used to free target folios if migration - * fails, or NULL if no special handling is necessary. - * @private: Private data to be passed on to get_new_page() - * @mode: The migration mode that specifies the constraints for - * folio migration, if any. - * @reason: The reason for folio migration. - * @ret_succeeded: Set to the number of folios migrated successfully if - * the caller passes a non-NULL pointer. - * - * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios - * are movable any more because the list has become empty or no retryable folios - * exist any more. It is caller's responsibility to call putback_movable_pages() - * only if ret != 0. - * - * Returns the number of {normal folio, large folio, hugetlb} that were not - * migrated, or an error code. The number of large folio splits will be - * considered as the number of non-migrated large folio, no matter how many - * split folios of the large folio are migrated successfully. - */ -int migrate_pages(struct list_head *from, new_page_t get_new_page, +static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, unsigned int *ret_succeeded) + enum migrate_mode mode, int reason, struct list_head *ret_folios, + struct migrate_pages_stats *stats) { int retry = 1; int large_retry = 1; int thp_retry = 1; - int nr_failed; + int nr_failed = 0; int nr_retry_pages = 0; int nr_large_failed = 0; int pass = 0; @@ -1556,20 +1536,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, bool is_thp = false; struct folio *folio, *folio2; int rc, nr_pages; - LIST_HEAD(ret_folios); LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); bool no_split_folio_counting = false; - struct migrate_pages_stats stats; - - trace_mm_migrate_pages_start(mode, reason); - - memset(&stats, 0, sizeof(stats)); - rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, - &stats, &ret_folios); - if (rc < 0) - goto out; - nr_failed = rc; split_folio_migration: for (pass = 0; @@ -1581,12 +1550,6 @@ split_folio_migration: nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { - /* Retried hugetlb folios will be kept in list */ - if (folio_test_hugetlb(folio)) { - list_move_tail(&folio->lru, &ret_folios); - continue; - } - /* * Large folio statistics is based on the source large * folio. Capture required information that might get @@ -1600,15 +1563,14 @@ split_folio_migration: rc = unmap_and_move(get_new_page, put_new_page, private, folio, pass > 2, mode, - reason, &ret_folios); + reason, ret_folios); /* * The rules are: * Success: folio will be freed * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list - * Other errno: put on ret_folios list then splice to - * from list + * Other errno: put on ret_folios list */ switch(rc) { /* @@ -1625,17 +1587,17 @@ split_folio_migration: /* Large folio migration is unsupported */ if (is_large) { nr_large_failed++; - stats.nr_thp_failed += is_thp; + stats->nr_thp_failed += is_thp; if (!try_split_folio(folio, &split_folios)) { - stats.nr_thp_split += is_thp; + stats->nr_thp_split += is_thp; break; } } else if (!no_split_folio_counting) { nr_failed++; } - stats.nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, &ret_folios); + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); break; case -ENOMEM: /* @@ -1644,13 +1606,13 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - stats.nr_thp_failed += is_thp; + stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { int ret = try_split_folio(folio, &split_folios); if (!ret) { - stats.nr_thp_split += is_thp; + stats->nr_thp_split += is_thp; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1668,17 +1630,17 @@ split_folio_migration: nr_failed++; } - stats.nr_failed_pages += nr_pages + nr_retry_pages; + stats->nr_failed_pages += nr_pages + nr_retry_pages; /* * There might be some split folios of fail-to-migrate large - * folios left in split_folios list. Move them back to migration + * folios left in split_folios list. Move them to ret_folios * list so that they could be put back to the right list by * the caller otherwise the folio refcnt will be leaked. */ - list_splice_init(&split_folios, from); + list_splice_init(&split_folios, ret_folios); /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; - stats.nr_thp_failed += thp_retry; + stats->nr_thp_failed += thp_retry; goto out; case -EAGAIN: if (is_large) { @@ -1690,8 +1652,8 @@ split_folio_migration: nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - stats.nr_succeeded += nr_pages; - stats.nr_thp_succeeded += is_thp; + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; break; default: /* @@ -1702,20 +1664,20 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - stats.nr_thp_failed += is_thp; + stats->nr_thp_failed += is_thp; } else if (!no_split_folio_counting) { nr_failed++; } - stats.nr_failed_pages += nr_pages; + stats->nr_failed_pages += nr_pages; break; } } } nr_failed += retry; nr_large_failed += large_retry; - stats.nr_thp_failed += thp_retry; - stats.nr_failed_pages += nr_retry_pages; + stats->nr_thp_failed += thp_retry; + stats->nr_failed_pages += nr_retry_pages; /* * Try to migrate split folios of fail-to-migrate large folios, no * nr_failed counting in this round, since all split folios of a @@ -1726,7 +1688,7 @@ split_folio_migration: * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY * retries) to ret_folios to avoid migrating them again. */ - list_splice_init(from, &ret_folios); + list_splice_init(from, ret_folios); list_splice_init(&split_folios, from); no_split_folio_counting = true; retry = 1; @@ -1734,6 +1696,82 @@ split_folio_migration: } rc = nr_failed + nr_large_failed; +out: + return rc; +} + +/* + * migrate_pages - migrate the folios specified in a list, to the free folios + * supplied as the target for the page migration + * + * @from: The list of folios to be migrated. + * @get_new_page: The function used to allocate free folios to be used + * as the target of the folio migration. + * @put_new_page: The function used to free target folios if migration + * fails, or NULL if no special handling is necessary. + * @private: Private data to be passed on to get_new_page() + * @mode: The migration mode that specifies the constraints for + * folio migration, if any. + * @reason: The reason for folio migration. + * @ret_succeeded: Set to the number of folios migrated successfully if + * the caller passes a non-NULL pointer. + * + * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios + * are movable any more because the list has become empty or no retryable folios + * exist any more. It is caller's responsibility to call putback_movable_pages() + * only if ret != 0. + * + * Returns the number of {normal folio, large folio, hugetlb} that were not + * migrated, or an error code. The number of large folio splits will be + * considered as the number of non-migrated large folio, no matter how many + * split folios of the large folio are migrated successfully. + */ +int migrate_pages(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) +{ + int rc, rc_gather; + int nr_pages; + struct folio *folio, *folio2; + LIST_HEAD(folios); + LIST_HEAD(ret_folios); + struct migrate_pages_stats stats; + + trace_mm_migrate_pages_start(mode, reason); + + memset(&stats, 0, sizeof(stats)); + + rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, + mode, reason, &stats, &ret_folios); + if (rc_gather < 0) + goto out; +again: + nr_pages = 0; + list_for_each_entry_safe(folio, folio2, from, lru) { + /* Retried hugetlb folios will be kept in list */ + if (folio_test_hugetlb(folio)) { + list_move_tail(&folio->lru, &ret_folios); + continue; + } + + nr_pages += folio_nr_pages(folio); + if (nr_pages > NR_MAX_BATCHED_MIGRATION) + break; + } + if (nr_pages > NR_MAX_BATCHED_MIGRATION) + list_cut_before(&folios, from, &folio->lru); + else + list_splice_init(from, &folios); + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &stats); + list_splice_tail_init(&folios, &ret_folios); + if (rc < 0) { + rc_gather = rc; + goto out; + } + rc_gather += rc; + if (!list_empty(from)) + goto again; out: /* * Put the permanent failure folio back to migration list, they @@ -1746,7 +1784,7 @@ out: * are migrated successfully. */ if (list_empty(from)) - rc = 0; + rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); @@ -1760,7 +1798,7 @@ out: if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; - return rc; + return rc_gather; } struct page *alloc_migration_target(struct page *page, unsigned long private) -- cgit v1.2.3 From 64c8902ed4418317cd416c566f896bd4a92b2efc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:39 +0800 Subject: migrate_pages: split unmap_and_move() to _unmap() and _move() This is a preparation patch to batch the folio unmapping and moving. In this patch, unmap_and_move() is split to migrate_folio_unmap() and migrate_folio_move(). So, we can batch _unmap() and _move() in different loops later. To pass some information between unmap and move, the original unused dst->mapping and dst->private are used. Link: https://lkml.kernel.org/r/20230213123444.155149-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 + mm/migrate.c | 169 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 129 insertions(+), 41 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index bdff950a8bb4..c88b96b48be7 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,6 +18,7 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 +#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration diff --git a/mm/migrate.c b/mm/migrate.c index d436f35fa145..5fd18a7cce62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1027,11 +1027,53 @@ out: return rc; } -static int __unmap_and_move(struct folio *src, struct folio *dst, +/* + * To record some information during migration, we use some unused + * fields (mapping and private) of struct folio of the newly allocated + * destination folio. This is safe because nobody is using them + * except us. + */ +static void __migrate_folio_record(struct folio *dst, + unsigned long page_was_mapped, + struct anon_vma *anon_vma) +{ + dst->mapping = (void *)anon_vma; + dst->private = (void *)page_was_mapped; +} + +static void __migrate_folio_extract(struct folio *dst, + int *page_was_mappedp, + struct anon_vma **anon_vmap) +{ + *anon_vmap = (void *)dst->mapping; + *page_was_mappedp = (unsigned long)dst->private; + dst->mapping = NULL; + dst->private = NULL; +} + +/* Cleanup src folio upon migration success */ +static void migrate_folio_done(struct folio *src, + enum migrate_reason reason) +{ + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__folio_test_movable(src))) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); + + if (reason != MR_MEMORY_FAILURE) + /* We release the page in page_handle_poison. */ + folio_put(src); +} + +static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { int rc = -EAGAIN; - bool page_was_mapped = false; + int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); @@ -1107,8 +1149,8 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, src, mode); - goto out_unlock_both; + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return MIGRATEPAGE_UNMAP; } /* @@ -1133,11 +1175,42 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, 0); - page_was_mapped = true; + page_was_mapped = 1; } - if (!folio_mapped(src)) - rc = move_to_new_folio(dst, src, mode); + if (!folio_mapped(src)) { + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return MIGRATEPAGE_UNMAP; + } + + if (page_was_mapped) + remove_migration_ptes(src, src, false); + +out_unlock_both: + folio_unlock(dst); +out_unlock: + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); +out: + + return rc; +} + +static int __migrate_folio_move(struct folio *src, struct folio *dst, + enum migrate_mode mode) +{ + int rc; + int page_was_mapped = 0; + struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(&src->page); + + __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + + rc = move_to_new_folio(dst, src, mode); + if (unlikely(!is_lru)) + goto out_unlock_both; /* * When successful, push dst to LRU immediately: so that if it @@ -1160,12 +1233,10 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, out_unlock_both: folio_unlock(dst); -out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); folio_unlock(src); -out: /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased @@ -1177,19 +1248,15 @@ out: return rc; } -/* - * Obtain the lock on folio, remove all ptes and migrate the folio - * to the newly allocated folio in dst. - */ -static int unmap_and_move(new_page_t get_new_page, - free_page_t put_new_page, - unsigned long private, struct folio *src, - int force, enum migrate_mode mode, - enum migrate_reason reason, - struct list_head *ret) +/* Obtain the lock on page, remove all ptes. */ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { struct folio *dst; - int rc = MIGRATEPAGE_SUCCESS; + int rc = MIGRATEPAGE_UNMAP; struct page *newpage = NULL; if (!thp_migration_supported() && folio_test_transhuge(src)) @@ -1200,20 +1267,49 @@ static int unmap_and_move(new_page_t get_new_page, folio_clear_active(src); folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. */ - goto out; + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; } newpage = get_new_page(&src->page, private); if (!newpage) return -ENOMEM; dst = page_folio(newpage); + *dstp = dst; dst->private = NULL; - rc = __unmap_and_move(src, dst, force, mode); + rc = __migrate_folio_unmap(src, dst, force, mode); + if (rc == MIGRATEPAGE_UNMAP) + return rc; + + /* + * A folio that has not been unmapped will be restored to + * right list unless we want to retry. + */ + if (rc != -EAGAIN) + list_move_tail(&src->lru, ret); + + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); + + return rc; +} + +/* Migrate the folio to the newly allocated folio in dst. */ +static int migrate_folio_move(free_page_t put_new_page, unsigned long private, + struct folio *src, struct folio *dst, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) +{ + int rc; + + rc = __migrate_folio_move(src, dst, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(&dst->page, reason); -out: if (rc != -EAGAIN) { /* * A folio that has been migrated has all references @@ -1229,20 +1325,7 @@ out: * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { - /* - * Compaction can migrate also non-LRU folios which are - * not accounted to NR_ISOLATED_*. They can be recognized - * as __folio_test_movable - */ - if (likely(!__folio_test_movable(src))) - mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + - folio_is_file_lru(src), -folio_nr_pages(src)); - - if (reason != MR_MEMORY_FAILURE) - /* - * We release the folio in page_handle_poison. - */ - folio_put(src); + migrate_folio_done(src, reason); } else { if (rc != -EAGAIN) list_add_tail(&src->lru, ret); @@ -1534,7 +1617,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, int pass = 0; bool is_large = false; bool is_thp = false; - struct folio *folio, *folio2; + struct folio *folio, *folio2, *dst = NULL; int rc, nr_pages; LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); @@ -1561,9 +1644,13 @@ split_folio_migration: cond_resched(); - rc = unmap_and_move(get_new_page, put_new_page, - private, folio, pass > 2, mode, - reason, ret_folios); + rc = migrate_folio_unmap(get_new_page, put_new_page, private, + folio, &dst, pass > 2, mode, + reason, ret_folios); + if (rc == MIGRATEPAGE_UNMAP) + rc = migrate_folio_move(put_new_page, private, + folio, dst, mode, + reason, ret_folios); /* * The rules are: * Success: folio will be freed -- cgit v1.2.3 From 5dfab109d5193e6c224d96cabf90e9cc2c039884 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:40 +0800 Subject: migrate_pages: batch _unmap and _move In this patch the _unmap and _move stage of the folio migration is batched. That for, previously, it is, for each folio _unmap() _move() Now, it is, for each folio _unmap() for each folio _move() Based on this, we can batch the TLB flushing and use some hardware accelerator to copy folios between batched _unmap and batched _move stages. Link: https://lkml.kernel.org/r/20230213123444.155149-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 189 insertions(+), 25 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 5fd18a7cce62..ee3e21f1061c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1051,6 +1051,33 @@ static void __migrate_folio_extract(struct folio *dst, dst->private = NULL; } +/* Restore the source folio to the original state upon failure */ +static void migrate_folio_undo_src(struct folio *src, + int page_was_mapped, + struct anon_vma *anon_vma, + struct list_head *ret) +{ + if (page_was_mapped) + remove_migration_ptes(src, src, false); + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); + list_move_tail(&src->lru, ret); +} + +/* Restore the destination folio to the original state upon failure */ +static void migrate_folio_undo_dst(struct folio *dst, + free_page_t put_new_page, + unsigned long private) +{ + folio_unlock(dst); + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); +} + /* Cleanup src folio upon migration success */ static void migrate_folio_done(struct folio *src, enum migrate_reason reason) @@ -1069,8 +1096,8 @@ static void migrate_folio_done(struct folio *src, folio_put(src); } -static int __migrate_folio_unmap(struct folio *src, struct folio *dst, - int force, enum migrate_mode mode) +static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, + bool avoid_force_lock, enum migrate_mode mode) { int rc = -EAGAIN; int page_was_mapped = 0; @@ -1097,6 +1124,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, if (current->flags & PF_MEMALLOC) goto out; + /* + * We have locked some folios and are going to wait to lock + * this folio. To avoid a potential deadlock, let's bail + * out and not do that. The locked folios will be moved and + * unlocked, then we can wait to lock this folio. + */ + if (avoid_force_lock) { + rc = -EDEADLOCK; + goto out; + } + folio_lock(src); } @@ -1205,10 +1243,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); + struct list_head *prev; __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + prev = dst->lru.prev; + list_del(&dst->lru); rc = move_to_new_folio(dst, src, mode); + + if (rc == -EAGAIN) { + list_add(&dst->lru, prev); + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return rc; + } + if (unlikely(!is_lru)) goto out_unlock_both; @@ -1251,7 +1299,7 @@ out_unlock_both: /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct folio *src, - struct folio **dstp, int force, + struct folio **dstp, int force, bool avoid_force_lock, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { @@ -1279,7 +1327,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page *dstp = dst; dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, mode); + rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); if (rc == MIGRATEPAGE_UNMAP) return rc; @@ -1287,7 +1335,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ - if (rc != -EAGAIN) + if (rc != -EAGAIN && rc != -EDEADLOCK) list_move_tail(&src->lru, ret); if (put_new_page) @@ -1326,9 +1374,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private, */ if (rc == MIGRATEPAGE_SUCCESS) { migrate_folio_done(src, reason); - } else { - if (rc != -EAGAIN) - list_add_tail(&src->lru, ret); + } else if (rc != -EAGAIN) { + list_add_tail(&src->lru, ret); if (put_new_page) put_new_page(&dst->page, private); @@ -1603,12 +1650,16 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, return nr_failed; } +/* + * migrate_pages_batch() first unmaps folios in the from list as many as + * possible, then move the unmapped folios. + */ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct migrate_pages_stats *stats) { - int retry = 1; + int retry; int large_retry = 1; int thp_retry = 1; int nr_failed = 0; @@ -1617,13 +1668,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, int pass = 0; bool is_large = false; bool is_thp = false; - struct folio *folio, *folio2, *dst = NULL; - int rc, nr_pages; + struct folio *folio, *folio2, *dst = NULL, *dst2; + int rc, rc_saved, nr_pages; LIST_HEAD(split_folios); + LIST_HEAD(unmap_folios); + LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); bool no_split_folio_counting = false; + bool avoid_force_lock; -split_folio_migration: +retry: + rc_saved = 0; + avoid_force_lock = false; + retry = 1; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); pass++) { @@ -1645,16 +1702,15 @@ split_folio_migration: cond_resched(); rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, pass > 2, mode, - reason, ret_folios); - if (rc == MIGRATEPAGE_UNMAP) - rc = migrate_folio_move(put_new_page, private, - folio, dst, mode, - reason, ret_folios); + folio, &dst, pass > 2, avoid_force_lock, + mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed + * Unmap: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list + * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list * Other errno: put on ret_folios list @@ -1689,7 +1745,7 @@ split_folio_migration: case -ENOMEM: /* * When memory is low, don't bother to try to migrate - * other folios, just exit. + * other folios, move unmapped folios, then exit. */ if (is_large) { nr_large_failed++; @@ -1728,7 +1784,19 @@ split_folio_migration: /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; - goto out; + rc_saved = rc; + if (list_empty(&unmap_folios)) + goto out; + else + goto move; + case -EDEADLOCK: + /* + * The folio cannot be locked for potential deadlock. + * Go move (and unlock) all locked folios. Then we can + * try again. + */ + rc_saved = rc; + goto move; case -EAGAIN: if (is_large) { large_retry++; @@ -1742,6 +1810,15 @@ split_folio_migration: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; + case MIGRATEPAGE_UNMAP: + /* + * We have locked some folios, don't force lock + * to avoid deadlock. + */ + avoid_force_lock = true; + list_move_tail(&folio->lru, &unmap_folios); + list_add_tail(&dst->lru, &dst_folios); + break; default: /* * Permanent failure (-EBUSY, etc.): @@ -1765,12 +1842,95 @@ split_folio_migration: nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; +move: + retry = 1; + for (pass = 0; + pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); + pass++) { + retry = 0; + large_retry = 0; + thp_retry = 0; + nr_retry_pages = 0; + + dst = list_first_entry(&dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { + is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = migrate_folio_move(put_new_page, private, + folio, dst, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the unmap_folios list + * Other errno: put on ret_folios list + */ + switch(rc) { + case -EAGAIN: + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { + retry++; + } + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + break; + default: + if (is_large) { + nr_large_failed++; + stats->nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { + nr_failed++; + } + + stats->nr_failed_pages += nr_pages; + break; + } + dst = dst2; + dst2 = list_next_entry(dst, lru); + } + } + nr_failed += retry; + nr_large_failed += large_retry; + stats->nr_thp_failed += thp_retry; + stats->nr_failed_pages += nr_retry_pages; + + if (rc_saved) + rc = rc_saved; + else + rc = nr_failed + nr_large_failed; +out: + /* Cleanup remaining folios */ + dst = list_first_entry(&dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { + int page_was_mapped = 0; + struct anon_vma *anon_vma = NULL; + + __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + migrate_folio_undo_src(folio, page_was_mapped, anon_vma, + ret_folios); + list_del(&dst->lru); + migrate_folio_undo_dst(dst, put_new_page, private); + dst = dst2; + dst2 = list_next_entry(dst, lru); + } + /* * Try to migrate split folios of fail-to-migrate large folios, no * nr_failed counting in this round, since all split folios of a * large folio is counted as 1 failure in the first round. */ - if (!list_empty(&split_folios)) { + if (rc >= 0 && !list_empty(&split_folios)) { /* * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY * retries) to ret_folios to avoid migrating them again. @@ -1778,12 +1938,16 @@ split_folio_migration: list_splice_init(from, ret_folios); list_splice_init(&split_folios, from); no_split_folio_counting = true; - retry = 1; - goto split_folio_migration; + goto retry; } - rc = nr_failed + nr_large_failed; -out: + /* + * We have unlocked all locked folios, so we can force lock now, let's + * try again. + */ + if (rc == -EDEADLOCK) + goto retry; + return rc; } -- cgit v1.2.3 From 80562ba0d8378e89fe5836c28ea56c2aab3014e8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:41 +0800 Subject: migrate_pages: move migrate_folio_unmap() Just move the position of the functions. There's no any functionality change. This is to make it easier to review the next patch via putting code near its position in the next patch. Link: https://lkml.kernel.org/r/20230213123444.155149-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 100 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ee3e21f1061c..0c7488ebe248 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1236,6 +1236,56 @@ out: return rc; } +/* Obtain the lock on page, remove all ptes. */ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, bool avoid_force_lock, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) +{ + struct folio *dst; + int rc = MIGRATEPAGE_UNMAP; + struct page *newpage = NULL; + + if (!thp_migration_supported() && folio_test_transhuge(src)) + return -ENOSYS; + + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. */ + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; + } + + newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); + *dstp = dst; + + dst->private = NULL; + rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); + if (rc == MIGRATEPAGE_UNMAP) + return rc; + + /* + * A folio that has not been unmapped will be restored to + * right list unless we want to retry. + */ + if (rc != -EAGAIN && rc != -EDEADLOCK) + list_move_tail(&src->lru, ret); + + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); + + return rc; +} + static int __migrate_folio_move(struct folio *src, struct folio *dst, enum migrate_mode mode) { @@ -1296,56 +1346,6 @@ out_unlock_both: return rc; } -/* Obtain the lock on page, remove all ptes. */ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - struct folio *dst; - int rc = MIGRATEPAGE_UNMAP; - struct page *newpage = NULL; - - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - - newpage = get_new_page(&src->page, private); - if (!newpage) - return -ENOMEM; - dst = page_folio(newpage); - *dstp = dst; - - dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); - if (rc == MIGRATEPAGE_UNMAP) - return rc; - - /* - * A folio that has not been unmapped will be restored to - * right list unless we want to retry. - */ - if (rc != -EAGAIN && rc != -EDEADLOCK) - list_move_tail(&src->lru, ret); - - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); - - return rc; -} - /* Migrate the folio to the newly allocated folio in dst. */ static int migrate_folio_move(free_page_t put_new_page, unsigned long private, struct folio *src, struct folio *dst, -- cgit v1.2.3 From ebe75e4751063dce6f61b579b43de86dcf7b7462 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:42 +0800 Subject: migrate_pages: share more code between _unmap and _move This is a code cleanup patch to reduce the duplicated code between the _unmap and _move stages of migrate_pages(). No functionality change is expected. Link: https://lkml.kernel.org/r/20230213123444.155149-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 207 ++++++++++++++++++++++++----------------------------------- 1 file changed, 85 insertions(+), 122 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 0c7488ebe248..00713ccb6643 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1055,6 +1055,7 @@ static void __migrate_folio_extract(struct folio *dst, static void migrate_folio_undo_src(struct folio *src, int page_was_mapped, struct anon_vma *anon_vma, + bool locked, struct list_head *ret) { if (page_was_mapped) @@ -1062,16 +1063,20 @@ static void migrate_folio_undo_src(struct folio *src, /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - folio_unlock(src); - list_move_tail(&src->lru, ret); + if (locked) + folio_unlock(src); + if (ret) + list_move_tail(&src->lru, ret); } /* Restore the destination folio to the original state upon failure */ static void migrate_folio_undo_dst(struct folio *dst, + bool locked, free_page_t put_new_page, unsigned long private) { - folio_unlock(dst); + if (locked) + folio_unlock(dst); if (put_new_page) put_new_page(&dst->page, private); else @@ -1096,13 +1101,42 @@ static void migrate_folio_done(struct folio *src, folio_put(src); } -static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, - bool avoid_force_lock, enum migrate_mode mode) +/* Obtain the lock on page, remove all ptes. */ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, bool avoid_force_lock, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { + struct folio *dst; int rc = -EAGAIN; + struct page *newpage = NULL; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); + bool locked = false; + bool dst_locked = false; + + if (!thp_migration_supported() && folio_test_transhuge(src)) + return -ENOSYS; + + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. */ + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; + } + + newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); + *dstp = dst; + + dst->private = NULL; if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) @@ -1137,6 +1171,7 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force folio_lock(src); } + locked = true; if (folio_test_writeback(src)) { /* @@ -1151,10 +1186,10 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force break; default: rc = -EBUSY; - goto out_unlock; + goto out; } if (!force) - goto out_unlock; + goto out; folio_wait_writeback(src); } @@ -1184,7 +1219,8 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force * This is much like races on refcount of oldpage: just don't BUG(). */ if (unlikely(!folio_trylock(dst))) - goto out_unlock; + goto out; + dst_locked = true; if (unlikely(!is_lru)) { __migrate_folio_record(dst, page_was_mapped, anon_vma); @@ -1206,7 +1242,7 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force if (!src->mapping) { if (folio_test_private(src)) { try_to_free_buffers(src); - goto out_unlock_both; + goto out; } } else if (folio_mapped(src)) { /* Establish migration ptes */ @@ -1221,73 +1257,25 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force return MIGRATEPAGE_UNMAP; } - if (page_was_mapped) - remove_migration_ptes(src, src, false); - -out_unlock_both: - folio_unlock(dst); -out_unlock: - /* Drop an anon_vma reference if we took one */ - if (anon_vma) - put_anon_vma(anon_vma); - folio_unlock(src); out: - - return rc; -} - -/* Obtain the lock on page, remove all ptes. */ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - struct folio *dst; - int rc = MIGRATEPAGE_UNMAP; - struct page *newpage = NULL; - - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - - newpage = get_new_page(&src->page, private); - if (!newpage) - return -ENOMEM; - dst = page_folio(newpage); - *dstp = dst; - - dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); - if (rc == MIGRATEPAGE_UNMAP) - return rc; - /* * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ - if (rc != -EAGAIN && rc != -EDEADLOCK) - list_move_tail(&src->lru, ret); + if (rc == -EAGAIN || rc == -EDEADLOCK) + ret = NULL; - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); + migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); + migrate_folio_undo_dst(dst, dst_locked, put_new_page, private); return rc; } -static int __migrate_folio_move(struct folio *src, struct folio *dst, - enum migrate_mode mode) +/* Migrate the folio to the newly allocated folio in dst. */ +static int migrate_folio_move(free_page_t put_new_page, unsigned long private, + struct folio *src, struct folio *dst, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { int rc; int page_was_mapped = 0; @@ -1300,12 +1288,8 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, list_del(&dst->lru); rc = move_to_new_folio(dst, src, mode); - - if (rc == -EAGAIN) { - list_add(&dst->lru, prev); - __migrate_folio_record(dst, page_was_mapped, anon_vma); - return rc; - } + if (rc) + goto out; if (unlikely(!is_lru)) goto out_unlock_both; @@ -1319,70 +1303,49 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ - if (rc == MIGRATEPAGE_SUCCESS) { - folio_add_lru(dst); - if (page_was_mapped) - lru_add_drain(); - } + folio_add_lru(dst); + if (page_was_mapped) + lru_add_drain(); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + remove_migration_ptes(src, dst, false); out_unlock_both: folio_unlock(dst); - /* Drop an anon_vma reference if we took one */ - if (anon_vma) - put_anon_vma(anon_vma); - folio_unlock(src); + set_page_owner_migrate_reason(&dst->page, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ - if (rc == MIGRATEPAGE_SUCCESS) - folio_put(dst); - - return rc; -} - -/* Migrate the folio to the newly allocated folio in dst. */ -static int migrate_folio_move(free_page_t put_new_page, unsigned long private, - struct folio *src, struct folio *dst, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - int rc; - - rc = __migrate_folio_move(src, dst, mode); - if (rc == MIGRATEPAGE_SUCCESS) - set_page_owner_migrate_reason(&dst->page, reason); - - if (rc != -EAGAIN) { - /* - * A folio that has been migrated has all references - * removed and will be freed. A folio that has not been - * migrated will have kept its references and be restored. - */ - list_del(&src->lru); - } + folio_put(dst); /* - * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the folio to right list unless - * we want to retry. + * A folio that has been migrated has all references removed + * and will be freed. */ - if (rc == MIGRATEPAGE_SUCCESS) { - migrate_folio_done(src, reason); - } else if (rc != -EAGAIN) { - list_add_tail(&src->lru, ret); + list_del(&src->lru); + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); + migrate_folio_done(src, reason); - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); + return rc; +out: + /* + * A folio that has not been migrated will be restored to + * right list unless we want to retry. + */ + if (rc == -EAGAIN) { + list_add(&dst->lru, prev); + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return rc; } + migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); + migrate_folio_undo_dst(dst, true, put_new_page, private); + return rc; } @@ -1918,9 +1881,9 @@ out: __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); migrate_folio_undo_src(folio, page_was_mapped, anon_vma, - ret_folios); + true, ret_folios); list_del(&dst->lru); - migrate_folio_undo_dst(dst, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_page, private); dst = dst2; dst2 = list_next_entry(dst, lru); } -- cgit v1.2.3 From 7e12beb8ca2ac98b2ec42e0ea4b76cdc93b58654 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:43 +0800 Subject: migrate_pages: batch flushing TLB The TLB flushing will cost quite some CPU cycles during the folio migration in some situations. For example, when migrate a folio of a process with multiple active threads that run on multiple CPUs. After batching the _unmap and _move in migrate_pages(), the TLB flushing can be batched easily with the existing TLB flush batching mechanism. This patch implements that. We use the following test case to test the patch. On a 2-socket Intel server, - Run pmbench memory accessing benchmark - Run `migratepages` to migrate pages of pmbench between node 0 and node 1 back and forth. With the patch, the TLB flushing IPI reduces 99.1% during the test and the number of pages migrated successfully per second increases 291.7%. Haoxin helped to test the patchset on an ARM64 server with 128 cores, 2 NUMA nodes. Test results show that the page migration performance increases up to 78%. NOTE: TLB flushing is batched only for normal folios, not for THP folios. Because the overhead of TLB flushing for THP folios is much lower than that for normal folios (about 1/512 on x86 platform). Link: https://lkml.kernel.org/r/20230213123444.155149-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Xin Hao Reviewed-by: Zi Yan Reviewed-by: Xin Hao Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 5 ++++- mm/rmap.c | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 00713ccb6643..2fa420e4f68c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1248,7 +1248,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); - try_to_migrate(src, 0); + try_to_migrate(src, TTU_BATCH_FLUSH); page_was_mapped = 1; } @@ -1806,6 +1806,9 @@ retry: stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: + /* Flush TLBs for all unmapped folios */ + try_to_unmap_flush(); + retry = 1; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); diff --git a/mm/rmap.c b/mm/rmap.c index 8287f2cc327d..15ae24585fc4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1952,7 +1952,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); /* Nuke the page table entry. */ - pteval = ptep_clear_flush(vma, address, pvmw.pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the folio. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); + + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pvmw.pte); + } } /* Set the dirty flag on the folio now the pte is gone. */ @@ -2124,10 +2138,10 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and - * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags. + * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC))) + TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && -- cgit v1.2.3 From 6f7d760e86fa84862d749e36ebd29abf31f4f883 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:44 +0800 Subject: migrate_pages: move THP/hugetlb migration support check to simplify code This is a code cleanup patch, no functionality change is expected. After the change, the line number reduces especially in the long migrate_pages_batch(). Link: https://lkml.kernel.org/r/20230213123444.155149-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Alistair Popple Reviewed-by: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 83 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2fa420e4f68c..ef68a1aff35c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1117,9 +1117,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page bool locked = false; bool dst_locked = false; - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - if (folio_ref_count(src) == 1) { /* Folio was freed from under us. So we are done. */ folio_clear_active(src); @@ -1380,16 +1377,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - /* - * Migratability of hugepages depends on architectures and their size. - * This check is necessary because some callers of hugepage migration - * like soft offline and memory hotremove don't walk through page - * tables or check whether the hugepage is pmd-based or not before - * kicking migration. - */ - if (!hugepage_migration_supported(page_hstate(hpage))) - return -ENOSYS; - if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_active_hugetlb(src); @@ -1556,6 +1543,20 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, cond_resched(); + /* + * Migratability of hugepages depends on architectures and + * their size. This check is necessary because some callers + * of hugepage migration like soft offline and memory + * hotremove don't walk through page tables or check whether + * the hugepage is pmd-based or not before kicking migration. + */ + if (!hugepage_migration_supported(folio_hstate(folio))) { + nr_failed++; + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + continue; + } + rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, &folio->page, pass > 2, mode, @@ -1565,16 +1566,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, * Success: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list - * -ENOSYS: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { - case -ENOSYS: - /* Hugetlb migration is unsupported */ - nr_failed++; - stats->nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, ret_folios); - break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate @@ -1664,6 +1658,28 @@ retry: cond_resched(); + /* + * Large folio migration might be unsupported or + * the allocation might be failed so we should retry + * on the same folio with the large folio split + * to normal folios. + * + * Split folios are put in split_folios, and + * we will migrate them after the rest of the + * list is processed. + */ + if (!thp_migration_supported() && is_thp) { + nr_large_failed++; + stats->nr_thp_failed++; + if (!try_split_folio(folio, &split_folios)) { + stats->nr_thp_split++; + continue; + } + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + continue; + } + rc = migrate_folio_unmap(get_new_page, put_new_page, private, folio, &dst, pass > 2, avoid_force_lock, mode, reason, ret_folios); @@ -1675,36 +1691,9 @@ retry: * -EAGAIN: stay on the from list * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list - * -ENOSYS: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { - /* - * Large folio migration might be unsupported or - * the allocation could've failed so we should retry - * on the same folio with the large folio split - * to normal folios. - * - * Split folios are put in split_folios, and - * we will migrate them after the rest of the - * list is processed. - */ - case -ENOSYS: - /* Large folio migration is unsupported */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - if (!try_split_folio(folio, &split_folios)) { - stats->nr_thp_split += is_thp; - break; - } - } else if (!no_split_folio_counting) { - nr_failed++; - } - - stats->nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, ret_folios); - break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate -- cgit v1.2.3 From 9f550d78b40da21b4da515db4c37d8d7b12aa1a6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 13 Feb 2023 00:53:22 -0700 Subject: mm: multi-gen LRU: avoid futile retries Recall that the per-node memcg LRU has two generations and they alternate when the last memcg (of a given node) is moved from one to the other. Each generation is also sharded into multiple bins to improve scalability. A reclaimer starts with a random bin (in the old generation) and, if it fails, it will retry, i.e., to try the rest of the bins. If a reclaimer fails with the last memcg, it should move this memcg to the young generation first, which causes the generations to alternate, and then retry. Otherwise, the retries will be futile because all other bins are empty. Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao Reported-by: T.J. Mercier Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d4b9fd1ae0ed..34535bbd4fe9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5356,18 +5356,20 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { + int op; int gen; int bin; int first_bin; struct lruvec *lruvec; struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; const struct hlist_nulls_node *pos; - int op = 0; - struct mem_cgroup *memcg = NULL; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: + op = 0; + memcg = NULL; gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); rcu_read_lock(); @@ -5391,14 +5393,22 @@ restart: op = shrink_one(lruvec, sc); - if (sc->nr_reclaimed >= nr_to_reclaim) - goto success; - rcu_read_lock(); + + if (sc->nr_reclaimed >= nr_to_reclaim) + break; } rcu_read_unlock(); + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + if (sc->nr_reclaimed >= nr_to_reclaim) + return; + /* restart if raced with lru_gen_rotate_memcg() */ if (gen != get_nulls_value(pos)) goto restart; @@ -5407,11 +5417,6 @@ restart: bin = get_memcg_bin(bin + 1); if (bin != first_bin) goto restart; -success: - if (op) - lru_gen_rotate_memcg(lruvec, op); - - mem_cgroup_put(memcg); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -- cgit v1.2.3 From 1bc67ca65b31bcb669c4eaca79b3c8d205bb212a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 12 Feb 2023 19:10:27 +0800 Subject: mm: page_alloc: call panic() when memoryless node allocation fails In free_area_init(), we will continue to run after allocation of memoryless node pgdat fails. However, in the subsequent process (such as when initializing zonelist), the case that NODE_DATA(nid) is NULL is not handled, which will cause panic. Instead of this, it's better to call panic() directly when the memory allocation fails during system boot. Link: https://lkml.kernel.org/r/20230212111027.95520-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21d820c42900..4b6bcec41c8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8405,11 +8405,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Allocator not initialized yet */ pgdat = arch_alloc_nodedata(nid); - if (!pgdat) { - pr_err("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - continue; - } + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); free_area_init_memoryless_node(nid); -- cgit v1.2.3 From 44081c77e8a4aac9c5a010ed0d9ccdcf684041e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Feb 2023 11:30:24 +0100 Subject: maple_tree: reduce stack usage with gcc-9 and earlier gcc-10 changed the way inlining works to be less aggressive, but older versions run into an oversized stack frame warning whenever CONFIG_KASAN_STACK is enabled, as that forces variables from inlined callees to be non-overlapping: lib/maple_tree.c: In function 'mas_wr_bnode': lib/maple_tree.c:4320:1: error: the frame size of 1424 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] Change the annotations on mas_store_b_node() and mas_commit_b_node() to explicitly forbid inlining in this configuration, which is the same behavior that newer versions already have. Link: https://lkml.kernel.org/r/20230214103030.1051950-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Vernon Yang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5e9703189259..646297cae5d1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -146,6 +146,13 @@ struct maple_subtree_state { struct maple_big_node *bn; }; +#ifdef CONFIG_KASAN_STACK +/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ +#define noinline_for_kasan noinline_for_stack +#else +#define noinline_for_kasan inline +#endif + /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -2107,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, * * Return: The actual end of the data stored in @b_node */ -static inline void mas_store_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; @@ -3579,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, * @b_node: The maple big node * @end: The end of the data. */ -static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; -- cgit v1.2.3 From 2ef8ed7ddd2e6e69da7802be51af8ad71326a74f Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 14 Feb 2023 15:35:49 +0000 Subject: mm: percpu: fix incorrect size in pcpu_obj_full_size() The extra space which is used to store the obj_cgroup membership is only valid when kmemcg is enabled. The kmemcg can be disabled via the kernel parameter "cgroup.memory=nokmem" at boot time. This helper is also used in non-memcg code, for example the tracepoint, so we should fix it. It was found by code review when I was implementing bpf memory usage[1]. No real issue happens in production environment. [1]. https://lwn.net/Articles/921991/ Link: https://lkml.kernel.org/r/20230214153549.12291-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Roman Gushchin Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Vasily Averin Signed-off-by: Andrew Morton --- mm/percpu-internal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 70b1ea23f4d2..f9847c131998 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,7 @@ #include #include +#include /* * pcpu_block_md is the metadata block struct. @@ -118,14 +119,15 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store - * obj_cgroup membership. Charge it too. + * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG_KMEM - extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + if (!mem_cgroup_kmem_disabled()) + extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; -- cgit v1.2.3 From 9325ddf90ec3a801c09da374b74532d4589a7346 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 14 Feb 2023 16:07:28 +0200 Subject: m68k/nommu: add missing definition of ARCH_PFN_OFFSET Patch series "fixups for generic implementation of pfn_valid()". Guenter reported boot failures on m68k-nommu and sh caused by the switch to the generic implementation of pfn_valid(): https://lore.kernel.org/all/20230212173513.GA4052259@roeck-us.net https://lore.kernel.org/all/20230212161320.GA3784076@roeck-us.net These are small fixups that address the issues. This patch (of 2): On m68k/nommu RAM does not necessarily start at 0x0 and when it does not pfn_valid() uses a wrong offset into the memory map which causes silent boot failures. Define ARCH_PFN_OFFSET to make pfn_valid() use the correct offset. Link: https://lkml.kernel.org/r/20230214140729.1649961-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20230214140729.1649961-2-rppt@kernel.org Fixes: d82f07f06cf8 ("m68k: use asm-generic/memory_model.h for both MMU and !MMU") Reported-by: Guenter Roeck Signed-off-by: Mike Rapoport (IBM) Acked-by: Greg Ungerer Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Rich Felker Cc: Yoshinori Sato Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton --- arch/m68k/include/asm/page_no.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 43ff6b109ebb..060e4c0e7605 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -28,6 +28,8 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) +#define ARCH_PFN_OFFSET PHYS_PFN(PAGE_OFFSET_RAW) + #endif /* __ASSEMBLY__ */ #endif /* _M68K_PAGE_NO_H */ -- cgit v1.2.3 From b4fb12e6c74791ac4c5c98b845628c576366b889 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 14 Feb 2023 16:07:29 +0200 Subject: sh: initialize max_mapnr sh never initializes max_mapnr which is used by the generic implementation of pfn_valid(). Initialize max_mapnr with set_max_mapnr() in sh::paging_init(). Link: https://lkml.kernel.org/r/20230214140729.1649961-3-rppt@kernel.org Fixes: e5080a967785 ("mm, arch: add generic implementation of pfn_valid() for FLATMEM") Reported-by: Guenter Roeck Signed-off-by: Mike Rapoport (IBM) Acked-by: John Paul Adrian Glaubitz Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 506784702430..bf1b54055316 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -301,6 +301,7 @@ void __init paging_init(void) */ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; min_low_pfn = __MEMORY_START >> PAGE_SHIFT; + set_max_mapnr(max_low_pfn - min_low_pfn); nodes_clear(node_online_map); -- cgit v1.2.3 From f7a449f779608efe1941a0e0c4bd7b5f57000be7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 13 Feb 2023 11:29:22 -0800 Subject: mm: memcontrol: rename memcg_kmem_enabled() Currently there are two kmem-related helper functions with a confusing semantics: memcg_kmem_enabled() and mem_cgroup_kmem_disabled(). The problem is that an obvious expectation memcg_kmem_enabled() == !mem_cgroup_kmem_disabled(), can be false. mem_cgroup_kmem_disabled() is similar to mem_cgroup_disabled(): it returns true only if CONFIG_MEMCG_KMEM is not set or the kmem accounting is disabled using a boot time kernel option "cgroup.memory=nokmem". It never changes the value dynamically. memcg_kmem_enabled() is different: it always returns false until the first non-root memory cgroup will get online (assuming the kernel memory accounting is enabled). It's goal is to improve the performance on systems without the cgroupfs mounted/memory controller enabled or on the systems with only the root memory cgroup. To make things more obvious and avoid potential bugs, let's rename memcg_kmem_enabled() to memcg_kmem_online(). Link: https://lkml.kernel.org/r/20230213192922.1146370-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Dennis Zhou Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 +++++++------- mm/memcontrol.c | 8 ++++---- mm/page_alloc.c | 8 ++++---- mm/percpu.c | 2 +- mm/slab.h | 10 +++++----- mm/vmscan.c | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35478695cabf..5567319027d1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1776,24 +1776,24 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); -extern struct static_key_false memcg_kmem_enabled_key; +extern struct static_key_false memcg_kmem_online_key; -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { - return static_branch_likely(&memcg_kmem_enabled_key); + return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } @@ -1814,7 +1814,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; rcu_read_lock(); @@ -1854,7 +1854,7 @@ static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) return NULL; } -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17335459d8dc..3e3cdb9bed95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,8 +345,8 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); -EXPORT_SYMBOL(memcg_kmem_enabled_key); +DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); +EXPORT_SYMBOL(memcg_kmem_online_key); #endif /** @@ -3034,7 +3034,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return NULL; if (PageMemcgKmem(page)) { @@ -3746,7 +3746,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); - static_branch_enable(&memcg_kmem_enabled_key); + static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b6bcec41c8f..4c9ab8b93b1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1410,7 +1410,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); page_table_check_free(page, order); @@ -1441,7 +1441,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free && free_page_is_bad(page)) bad++; @@ -5432,7 +5432,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto out; /* Bulk allocator does not support memcg accounting. */ - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) goto failed; /* Use the single page allocator for one page. */ @@ -5604,7 +5604,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; diff --git a/mm/percpu.c b/mm/percpu.c index acd78da0493b..28e07ede46f6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1625,7 +1625,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) + if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 63fb4c00d529..43966aa5fadf 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -494,7 +494,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return true; if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) @@ -535,7 +535,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, unsigned long off; size_t i; - if (!memcg_kmem_enabled() || !objcg) + if (!memcg_kmem_online() || !objcg) return; for (i = 0; i < size; i++) { @@ -567,7 +567,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, struct obj_cgroup **objcgs; int i; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; objcgs = slab_objcgs(slab); @@ -649,7 +649,7 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { - if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) memcg_alloc_slab_cgroups(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), @@ -659,7 +659,7 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) memcg_free_slab_cgroups(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), diff --git a/mm/vmscan.c b/mm/vmscan.c index 34535bbd4fe9..098c79129c42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -915,7 +915,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, } /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_enabled() && + if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; -- cgit v1.2.3 From 9701c9ff8311fed118fd09d962a90e254e761d97 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:56 +0100 Subject: kasan: mark addr_has_metadata __always_inline Patch series "objtool warning fixes", v2. These are three of the easier fixes for objtool warnings around kasan/kmsan/kcsan. I dropped one patch since Peter had come up with a better fix, and adjusted the changelog text based on feedback. This patch (of 3): When the compiler decides not to inline this function, objtool complains about incorrect UACCESS state: mm/kasan/generic.o: warning: objtool: __asan_load2+0x11: call to addr_has_metadata() with UACCESS enabled Link: https://lore.kernel.org/all/20230208164011.2287122-1-arnd@kernel.org/ Link: https://lkml.kernel.org/r/20230215130058.3836177-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 308fb70fd40a..8fae87ab99cc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -297,7 +297,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool addr_has_metadata(const void *addr) +static __always_inline bool addr_has_metadata(const void *addr) { return (kasan_reset_tag(addr) >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); @@ -316,7 +316,7 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -static inline bool addr_has_metadata(const void *addr) +static __always_inline bool addr_has_metadata(const void *addr) { return (is_vmalloc_addr(addr) || virt_addr_valid(addr)); } -- cgit v1.2.3 From e75a698859a3f62825af06987da6b3e6466e6e56 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:57 +0100 Subject: kmsan: disable ftrace in kmsan core code objtool warns about some suspicous code inside of kmsan: vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_n+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_n+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_1+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_1+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_2+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_2+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_4+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_4+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_8+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_8+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_instrument_asm_store+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_chain_origin+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_poison_alloca+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_warning+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_get_context_state+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_copy_to_user+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_unpoison_memory+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_unpoison_entry_regs+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_report+0x4: call to __fentry__() with UACCESS enabled The Makefile contained a line to turn off ftrace for the entire directory, but this does not work. Replace it with individual lines, matching the approach in kasan. Link: https://lkml.kernel.org/r/20230215130058.3836177-3-arnd@kernel.org Signed-off-by: Arnd Bergmann Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Acked-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kmsan/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 98eab2856626..91cfdde642d1 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -14,7 +14,13 @@ CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING -CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) +# Disable ftrace to avoid recursion. +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hooks.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_instrumentation.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) -- cgit v1.2.3 From d5d469247264e56960705dc5ae7e1d014861fe40 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:58 +0100 Subject: objtool: add UACCESS exceptions for __tsan_volatile_read/write A lot of the tsan helpers are already excempt from the UACCESS warnings, but some more functions were added that need the same thing: kernel/kcsan/core.o: warning: objtool: __tsan_volatile_read16+0x0: call to __tsan_unaligned_read16() with UACCESS enabled kernel/kcsan/core.o: warning: objtool: __tsan_volatile_write16+0x0: call to __tsan_unaligned_write16() with UACCESS enabled vmlinux.o: warning: objtool: __tsan_unaligned_volatile_read16+0x4: call to __tsan_unaligned_read16() with UACCESS enabled vmlinux.o: warning: objtool: __tsan_unaligned_volatile_write16+0x4: call to __tsan_unaligned_write16() with UACCESS enabled As Marco points out, these functions don't even call each other explicitly but instead gcc (but not clang) notices the functions being identical and turns one symbol into a direct branch to the other. Link: https://lkml.kernel.org/r/20230215130058.3836177-4-arnd@kernel.org Fixes: 75d75b7a4d54 ("kcsan: Support distinguishing volatile accesses") Signed-off-by: Arnd Bergmann Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Peter Zijlstra (Intel) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- tools/objtool/check.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4b7c8b33069e..b1a5f658673f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1186,6 +1186,8 @@ static const char *uaccess_safe_builtin[] = { "__tsan_atomic64_compare_exchange_val", "__tsan_atomic_thread_fence", "__tsan_atomic_signal_fence", + "__tsan_unaligned_read16", + "__tsan_unaligned_write16", /* KCOV */ "write_comp_data", "check_kcov_mode", -- cgit v1.2.3 From be2d57563822b7e00b2b16d9354637c4b6d6d5cc Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:34 +0800 Subject: mm: change to return bool for folio_isolate_lru() Patch series "Change the return value for page isolation functions", v3. Now the page isolation functions did not return a boolean to indicate success or not, instead it will return a negative error when failed to isolate a page. So below code used in most places seem a boolean success/failure thing, which can confuse people whether the isolation is successful. if (folio_isolate_lru(folio)) continue; Moreover the page isolation functions only return 0 or -EBUSY, and most users did not care about the negative error except for few users, thus we can convert all page isolation functions to return a boolean value, which can remove the confusion to make code more clear. No functional changes intended in this patch series. This patch (of 4): Now the folio_isolate_lru() did not return a boolean value to indicate isolation success or not, however below code checking the return value can make people think that it was a boolean success/failure thing, which makes people easy to make mistakes (see the fix patch[1]). if (folio_isolate_lru(folio)) continue; Thus it's better to check the negative error value expilictly returned by folio_isolate_lru(), which makes code more clear per Linus's suggestion[2]. Moreover Matthew suggested we can convert the isolation functions to return a boolean[3], since most users did not care about the negative error value, and can also remove the confusing of checking return value. So this patch converts the folio_isolate_lru() to return a boolean value, which means return 'true' to indicate the folio isolation is successful, and 'false' means a failure to isolation. Meanwhile changing all users' logic of checking the isolation state. No functional changes intended. [1] https://lore.kernel.org/all/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com/T/#u [2] https://lore.kernel.org/all/CAHk-=wiBrY+O-4=2mrbVyxR+hOqfdJ=Do6xoucfJ9_5az01L4Q@mail.gmail.com/ [3] https://lore.kernel.org/all/Y+sTFqwMNAjDvxw3@casper.infradead.org/ Link: https://lkml.kernel.org/r/cover.1676424378.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/8a4e3679ed4196168efadf7ea36c038f2f7d5aa9.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 2 +- mm/folio-compat.c | 8 +++++++- mm/gup.c | 2 +- mm/internal.h | 2 +- mm/khugepaged.c | 2 +- mm/madvise.c | 4 ++-- mm/mempolicy.c | 2 +- mm/vmscan.c | 10 +++++----- 8 files changed, 19 insertions(+), 13 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b4df9b9bcc0a..607bb69e526c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -246,7 +246,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) folio_clear_referenced(folio); folio_test_clear_young(folio); - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { folio_put(folio); continue; } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 18c48b557926..540373cf904e 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -115,9 +115,15 @@ EXPORT_SYMBOL(grab_cache_page_write_begin); int isolate_lru_page(struct page *page) { + bool ret; + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return -EBUSY; - return folio_isolate_lru((struct folio *)page); + ret = folio_isolate_lru((struct folio *)page); + if (ret) + return 0; + + return -EBUSY; } void putback_lru_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index b0885f70579c..eab18ba045db 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1939,7 +1939,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (folio_isolate_lru(folio)) + if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/internal.h b/mm/internal.h index dfb37e94e140..8645e8496537 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, * in mm/vmscan.c: */ int isolate_lru_page(struct page *page); -int folio_isolate_lru(struct folio *folio); +bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bd54b957f69a..15eebab0fbb5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1950,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } diff --git a/mm/madvise.c b/mm/madvise.c index 5a5a687d03c2..c2202f51e9dd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -406,7 +406,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else @@ -500,7 +500,7 @@ regular_folio: folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0919c7a719d4..2751bc3310fd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1033,7 +1033,7 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, * expensive, so check the estimated mapcount of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), diff --git a/mm/vmscan.c b/mm/vmscan.c index 098c79129c42..9c1c5e8b24b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2337,12 +2337,12 @@ move: * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * - * Return: 0 if the folio was removed from an LRU list. - * -EBUSY if the folio was not on an LRU list. + * Return: true if the folio was removed from an LRU list. + * false if the folio was not on an LRU list. */ -int folio_isolate_lru(struct folio *folio) +bool folio_isolate_lru(struct folio *folio) { - int ret = -EBUSY; + bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); @@ -2353,7 +2353,7 @@ int folio_isolate_lru(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); - ret = 0; + ret = true; } return ret; -- cgit v1.2.3 From f7f9c00dfafffd7a5a1a5685e2d874c64913e2ed Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:35 +0800 Subject: mm: change to return bool for isolate_lru_page() The isolate_lru_page() can only return 0 or -EBUSY, and most users did not care about the negative error of isolate_lru_page(), except one user in add_page_for_migration(). So we can convert the isolate_lru_page() to return a boolean value, which can help to make the code more clear when checking the return value of isolate_lru_page(). Also convert all users' logic of checking the isolation state. No functional changes intended. Link: https://lkml.kernel.org/r/3074c1ab628d9dbf139b33f248a8bc253a3f95f0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/folio-compat.c | 12 +++--------- mm/internal.h | 2 +- mm/khugepaged.c | 2 +- mm/memcontrol.c | 4 ++-- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 8 +++++--- mm/migrate.c | 9 ++++++--- mm/migrate_device.c | 2 +- 8 files changed, 21 insertions(+), 22 deletions(-) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 540373cf904e..cabcd1de9ecb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -113,17 +113,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -int isolate_lru_page(struct page *page) +bool isolate_lru_page(struct page *page) { - bool ret; - if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) - return -EBUSY; - ret = folio_isolate_lru((struct folio *)page); - if (ret) - return 0; - - return -EBUSY; + return false; + return folio_isolate_lru((struct folio *)page); } void putback_lru_page(struct page *page) diff --git a/mm/internal.h b/mm/internal.h index 8645e8496537..fc01fd092ea5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -187,7 +187,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, /* * in mm/vmscan.c: */ -int isolate_lru_page(struct page *page); +bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 15eebab0fbb5..987281ead49e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -636,7 +636,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ - if (isolate_lru_page(page)) { + if (!isolate_lru_page(page)) { unlock_page(page); result = SCAN_DEL_PAGE_LRU; goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e3cdb9bed95..25f2465d5a37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6176,7 +6176,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; - if (!isolate_lru_page(page)) { + if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6226,7 +6226,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (!device && isolate_lru_page(page)) + if (!device && !isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index db85c2d37f70..e504362fdb23 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -846,7 +846,7 @@ static const char * const action_page_types[] = { */ static int delete_from_lru_cache(struct page *p) { - if (!isolate_lru_page(p)) { + if (isolate_lru_page(p)) { /* * Clear sensible page flags, so that the buddy system won't * complain when the page is unpoison-and-freed. @@ -2513,7 +2513,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool lru = !__PageMovable(page); if (lru) - isolated = !isolate_lru_page(page); + isolated = isolate_lru_page(page); else isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a1e8c3e9ab08..5fc2dcf4e3ab 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1632,6 +1632,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct folio *folio; + bool isolated; if (!pfn_valid(pfn)) continue; @@ -1667,9 +1668,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ - if (PageLRU(page)) - ret = isolate_lru_page(page); - else + if (PageLRU(page)) { + isolated = isolate_lru_page(page); + ret = isolated ? 0 : -EBUSY; + } else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ list_add_tail(&page->lru, &source); diff --git a/mm/migrate.c b/mm/migrate.c index ef68a1aff35c..53010a142e7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2132,11 +2132,14 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, } } else { struct page *head; + bool isolated; head = compound_head(page); - err = isolate_lru_page(head); - if (err) + isolated = isolate_lru_page(head); + if (!isolated) { + err = -EBUSY; goto out_putpage; + } err = 1; list_add_tail(&head->lru, pagelist); @@ -2541,7 +2544,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; } - if (isolate_lru_page(page)) + if (!isolate_lru_page(page)) return 0; mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 6c3740318a98..d30c9de60b0d 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -388,7 +388,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, allow_drain = false; } - if (isolate_lru_page(page)) { + if (!isolate_lru_page(page)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; -- cgit v1.2.3 From 9747b9e92418b61c2281561e0651803f1fad0159 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:36 +0800 Subject: mm: hugetlb: change to return bool for isolate_hugetlb() Now the isolate_hugetlb() only returns 0 or -EBUSY, and most users did not care about the negative value, thus we can convert the isolate_hugetlb() to return a boolean value to make code more clear when checking the hugetlb isolation state. Moreover converts 2 users which will consider the negative value returned by isolate_hugetlb(). No functional changes intended. [akpm@linux-foundation.org: shorten locked section, per SeongJae Park] Link: https://lkml.kernel.org/r/12a287c5bebc13df304387087bbecc6421510849.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 13 ++++++++----- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 +++---- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index df6dd624ccfe..5f5e4177b2e0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct folio *folio, struct list_head *list); +bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,9 +413,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - return -EBUSY; + return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a01a9dbf445..07abcb6eb203 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,12 +2925,15 @@ retry: */ goto free_new; } else if (folio_ref_count(old_folio)) { + bool isolated; + /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_folio, list); + isolated = isolate_hugetlb(old_folio, list); + ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3005,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7251,15 +7254,15 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct folio *folio, struct list_head *list) +bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - int ret = 0; + bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { - ret = -EBUSY; + ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e504362fdb23..8604753bc644 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2751bc3310fd..a256a241fd1d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -609,7 +609,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(folio, qp->pagelist) && + if (!isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate folio but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 53010a142e7f..2db546a0618c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2095,6 +2095,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; struct page *page; int err; + bool isolated; mmap_read_lock(mm); err = -EFAULT; @@ -2126,13 +2127,11 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page_folio(page), pagelist); - if (!err) - err = 1; + isolated = isolate_hugetlb(page_folio(page), pagelist); + err = isolated ? 1 : -EBUSY; } } else { struct page *head; - bool isolated; head = compound_head(page); isolated = isolate_lru_page(head); -- cgit v1.2.3 From cd7755800eb54e8522f5e51f4e71e6494c1f1572 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:37 +0800 Subject: mm: change to return bool for isolate_movable_page() Now the isolate_movable_page() can only return 0 or -EBUSY, and no users will care about the negative return value, thus we can convert the isolate_movable_page() to return a boolean value to make the code more clear when checking the movable page isolation state. No functional changes intended. [akpm@linux-foundation.org: remove unneeded comment, per Matthew] Link: https://lkml.kernel.org/r/cb877f73f4fff8d309611082ec740a7065b1ade0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 +++--- mm/compaction.c | 2 +- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 10 +++++----- mm/migrate.c | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c88b96b48be7..6b252f519c86 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -71,7 +71,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); -extern int isolate_movable_page(struct page *page, isolate_mode_t mode); +extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -92,8 +92,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, static inline struct page *alloc_migration_target(struct page *page, unsigned long private) { return NULL; } -static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) - { return -EBUSY; } +static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) diff --git a/mm/compaction.c b/mm/compaction.c index d73578af44cc..ad7409f70519 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -976,7 +976,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - if (!isolate_movable_page(page, mode)) + if (isolate_movable_page(page, mode)) goto isolate_success; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8604753bc644..a1ede7bdce95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2515,8 +2515,8 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) if (lru) isolated = isolate_lru_page(page); else - isolated = !isolate_movable_page(page, - ISOLATE_UNEVICTABLE); + isolated = isolate_movable_page(page, + ISOLATE_UNEVICTABLE); if (isolated) { list_add(&page->lru, pagelist); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5fc2dcf4e3ab..5f73fd894b89 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1668,18 +1668,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ - if (PageLRU(page)) { + if (PageLRU(page)) isolated = isolate_lru_page(page); - ret = isolated ? 0 : -EBUSY; - } else - ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - if (!ret) { /* Success */ + else + isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); + if (isolated) { list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); } else { + ret = -EBUSY; if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); diff --git a/mm/migrate.c b/mm/migrate.c index 2db546a0618c..9a101c7bb8ff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -58,7 +58,7 @@ #include "internal.h" -int isolate_movable_page(struct page *page, isolate_mode_t mode) +bool isolate_movable_page(struct page *page, isolate_mode_t mode) { struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; @@ -119,14 +119,14 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) folio_set_isolated(folio); folio_unlock(folio); - return 0; + return true; out_no_isolated: folio_unlock(folio); out_putfolio: folio_put(folio); out: - return -EBUSY; + return false; } static void putback_movable_folio(struct folio *folio) -- cgit v1.2.3 From 7a079ba20090ab50d2f4203ceccd1e0f4becd1a6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 15 Feb 2023 15:58:00 -0500 Subject: mm/uffd: fix comment in handling pte markers The comment is obsolete after f369b07c8614 ("mm/uffd: reset write protection when unregister with wp-mode", 2022-08-20). Remove it. Link: https://lkml.kernel.org/r/20230215205800.223549-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7a04a1130ec1..f456f3b5049c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3627,9 +3627,7 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region - * got unregistered - we can simply clear them. We can also do that - * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp - * ranges, but it should be more efficient to be done lazily here. + * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) return pte_marker_clear(vmf); -- cgit v1.2.3 From 32cf666eab720b597650d9dea6ff07e99cd36b3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 16 Feb 2023 17:07:03 +0000 Subject: mm/memory_hotplug: cleanup return value handing in do_migrate_range() Return value mechanism of do_migrate_range() is not very simple, while no caller of the function checks the return value. Make the function return nothing to be more simple, and cleanup related unnecessary code. Link: https://lkml.kernel.org/r/20230216170703.64574-1-sj@kernel.org Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Signed-off-by: SeongJae Park Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5f73fd894b89..db3b270254f1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1620,12 +1620,10 @@ found: return 0; } -static int -do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page, *head; - int ret = 0; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1679,7 +1677,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_lru(page)); } else { - ret = -EBUSY; if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); @@ -1693,6 +1690,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + int ret; /* * We have checked that migration range is on a single zone so @@ -1721,8 +1719,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_movable_pages(&source); } } - - return ret; } static int __init cmdline_parse_movable_node(char *p) -- cgit v1.2.3 From f9366f4c2a29d14f5992b195e268240c2deb116e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 16 Feb 2023 14:44:24 -0800 Subject: include/linux/migrate.h: remove unneeded externs As suggested by Matthew. Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6b252f519c86..6241a1596a75 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,16 +62,16 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION -extern void putback_movable_pages(struct list_head *l); +void putback_movable_pages(struct list_head *l); int migrate_folio_extra(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); -extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason, - unsigned int *ret_succeeded); -extern struct page *alloc_migration_target(struct page *page, unsigned long private); -extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); +struct page *alloc_migration_target(struct page *page, unsigned long private); +bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -142,8 +142,8 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, - struct vm_area_struct *vma, int node); +int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, + int node); #else static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) -- cgit v1.2.3