From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 30 May 2022 14:34:50 -0400 Subject: mm: avoid unnecessary page fault retires on shared memory types I observed that for each of the shared file-backed page faults, we're very likely to retry one more time for the 1st write fault upon no page. It's because we'll need to release the mmap lock for dirty rate limit purpose with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()). Then after that throttling we return VM_FAULT_RETRY. We did that probably because VM_FAULT_RETRY is the only way we can return to the fault handler at that time telling it we've released the mmap lock. However that's not ideal because it's very likely the fault does not need to be retried at all since the pgtable was well installed before the throttling, so the next continuous fault (including taking mmap read lock, walk the pgtable, etc.) could be in most cases unnecessary. It's not only slowing down page faults for shared file-backed, but also add more mmap lock contention which is in most cases not needed at all. To observe this, one could try to write to some shmem page and look at "pgfault" value in /proc/vmstat, then we should expect 2 counts for each shmem write simply because we retried, and vm event "pgfault" will capture that. To make it more efficient, add a new VM_FAULT_COMPLETED return code just to show that we've completed the whole fault and released the lock. It's also a hint that we should very possibly not need another fault immediately on this page because we've just completed it. This patch provides a ~12% perf boost on my aarch64 test VM with a simple program sequentially dirtying 400MB shmem file being mmap()ed and these are the time it needs: Before: 650.980 ms (+-1.94%) After: 569.396 ms (+-1.38%) I believe it could help more than that. We need some special care on GUP and the s390 pgfault handler (for gmap code before returning from pgfault), the rest changes in the page fault handlers should be relatively straightforward. Another thing to mention is that mm_account_fault() does take this new fault as a generic fault to be accounted, unlike VM_FAULT_RETRY. I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping them as-is. Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Vineet Gupta Acked-by: Guo Ren Acked-by: Max Filippov Acked-by: Christian Borntraeger Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Reviewed-by: Alistair Popple Reviewed-by: Ingo Molnar Acked-by: Russell King (Oracle) [arm part] Acked-by: Heiko Carstens Cc: Vasily Gorbik Cc: Stafford Horne Cc: David S. Miller Cc: Johannes Berg Cc: Brian Cain Cc: Richard Henderson Cc: Richard Weinberger Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Janosch Frank Cc: Albert Ou Cc: Anton Ivanov Cc: Dave Hansen Cc: Borislav Petkov Cc: Sven Schnelle Cc: Andrea Arcangeli Cc: James Bottomley Cc: Al Viro Cc: Alexander Gordeev Cc: Jonas Bonn Cc: Will Deacon Cc: Vlastimil Babka Cc: Michal Simek Cc: Matt Turner Cc: Paul Mackerras Cc: David Hildenbrand Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Stefan Kristiansson Cc: Paul Walmsley Cc: Ivan Kokshaysky Cc: Chris Zankel Cc: Hugh Dickins Cc: Dinh Nguyen Cc: Rich Felker Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Thomas Bogendoerfer Cc: Helge Deller Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/mm/fault.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/arm') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index a062e07516dd..46cccd6bf705 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -322,6 +322,10 @@ retry: return 0; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From ca26f936f51b8c9219ede32b1a1f76c4924897aa Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:57 +0530 Subject: arm/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-24-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Russell King Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 1 + arch/arm/include/asm/pgtable.h | 17 ----------------- arch/arm/lib/uaccess_with_memcpy.c | 2 +- arch/arm/mm/mmu.c | 20 ++++++++++++++++++++ 4 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/arm') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 7630ba9cb6cc..e153b6d4fc5b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index cd1f84bb40ae..78a532068fec 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -137,23 +137,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, * 2) If we could do execute protection, then read is implied * 3) write implies read permissions */ -#define __P000 __PAGE_NONE -#define __P001 __PAGE_READONLY -#define __P010 __PAGE_COPY -#define __P011 __PAGE_COPY -#define __P100 __PAGE_READONLY_EXEC -#define __P101 __PAGE_READONLY_EXEC -#define __P110 __PAGE_COPY_EXEC -#define __P111 __PAGE_COPY_EXEC - -#define __S000 __PAGE_NONE -#define __S001 __PAGE_READONLY -#define __S010 __PAGE_SHARED -#define __S011 __PAGE_SHARED -#define __S100 __PAGE_READONLY_EXEC -#define __S101 __PAGE_READONLY_EXEC -#define __S110 __PAGE_SHARED_EXEC -#define __S111 __PAGE_SHARED_EXEC #ifndef __ASSEMBLY__ /* diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c30b689bec2e..14eecaaf295f 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -237,7 +237,7 @@ static int __init test_size_treshold(void) if (!dst_page) goto no_dst; kernel_ptr = page_address(src_page); - user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010)); + user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__PAGE_COPY)); if (!user_ptr) goto no_vmap; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 5e2be37a198e..2722abddd725 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -405,6 +405,26 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); } +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = __PAGE_NONE, + [VM_READ] = __PAGE_READONLY, + [VM_WRITE] = __PAGE_COPY, + [VM_WRITE | VM_READ] = __PAGE_COPY, + [VM_EXEC] = __PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = __PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = __PAGE_COPY_EXEC, + [VM_SHARED] = __PAGE_NONE, + [VM_SHARED | VM_READ] = __PAGE_READONLY, + [VM_SHARED | VM_WRITE] = __PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = __PAGE_SHARED, + [VM_SHARED | VM_EXEC] = __PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = __PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __PAGE_SHARED_EXEC +}; +DECLARE_VM_GET_PAGE_PROT + /* * Adjust the PMD section entries according to the CPU in use. */ -- cgit v1.2.3 From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:36:00 +0530 Subject: mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT Now all the platforms enable ARCH_HAS_GET_PAGE_PROT. They define and export own vm_get_page_prot() whether custom or standard DECLARE_VM_GET_PAGE_PROT. Hence there is no need for default generic fallback for vm_get_page_prot(). Just drop this fallback and also ARCH_HAS_GET_PAGE_PROT mechanism. Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/arm') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e153b6d4fc5b..7630ba9cb6cc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,7 +24,6 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K select ARCH_HAS_GCOV_PROFILE_ALL -- cgit v1.2.3 From 391145380f4b432403d8bdaf53ad7109104cb0df Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Jul 2022 18:47:08 +0300 Subject: ARM: head.S: rename PMD_ORDER to PMD_ENTRY_ORDER PMD_ORDER denotes order of magnitude for a PMD entry, i.e PMD entry size is 2 ^ PMD_ORDER. Rename PMD_ORDER to PMD_ENTRY_ORDER to allow a generic definition of PMD_ORDER as order of a PMD allocation: (PMD_SHIFT - PAGE_SHIFT). Link: https://lkml.kernel.org/r/20220705154708.181258-16-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/kernel/head.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/arm') diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 500612d3da2e..29e2900178a1 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -38,10 +38,10 @@ #ifdef CONFIG_ARM_LPAE /* LPAE requires an additional page for the PGD */ #define PG_DIR_SIZE 0x5000 -#define PMD_ORDER 3 +#define PMD_ENTRY_ORDER 3 /* PMD entry size is 2^PMD_ENTRY_ORDER */ #else #define PG_DIR_SIZE 0x4000 -#define PMD_ORDER 2 +#define PMD_ENTRY_ORDER 2 #endif .globl swapper_pg_dir @@ -240,7 +240,7 @@ __create_page_tables: mov r6, r6, lsr #SECTION_SHIFT 1: orr r3, r7, r5, lsl #SECTION_SHIFT @ flags + kernel base - str r3, [r4, r5, lsl #PMD_ORDER] @ identity mapping + str r3, [r4, r5, lsl #PMD_ENTRY_ORDER] @ identity mapping cmp r5, r6 addlo r5, r5, #1 @ next section blo 1b @@ -250,7 +250,7 @@ __create_page_tables: * set two variables to indicate the physical start and end of the * kernel. */ - add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ENTRY_ORDER) ldr r6, =(_end - 1) adr_l r5, kernel_sec_start @ _pa(kernel_sec_start) #if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 @@ -259,8 +259,8 @@ __create_page_tables: str r8, [r5] @ Save physical start of kernel (LE) #endif orr r3, r8, r7 @ Add the MMU flags - add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER) -1: str r3, [r0], #1 << PMD_ORDER + add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER) +1: str r3, [r0], #1 << PMD_ENTRY_ORDER add r3, r3, #1 << SECTION_SHIFT cmp r0, r6 bls 1b @@ -280,14 +280,14 @@ __create_page_tables: mov r3, pc mov r3, r3, lsr #SECTION_SHIFT orr r3, r7, r3, lsl #SECTION_SHIFT - add r0, r4, #(XIP_START & 0xff000000) >> (SECTION_SHIFT - PMD_ORDER) - str r3, [r0, #((XIP_START & 0x00f00000) >> SECTION_SHIFT) << PMD_ORDER]! + add r0, r4, #(XIP_START & 0xff000000) >> (SECTION_SHIFT - PMD_ENTRY_ORDER) + str r3, [r0, #((XIP_START & 0x00f00000) >> SECTION_SHIFT) << PMD_ENTRY_ORDER]! ldr r6, =(_edata_loc - 1) - add r0, r0, #1 << PMD_ORDER - add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER) + add r0, r0, #1 << PMD_ENTRY_ORDER + add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER) 1: cmp r0, r6 add r3, r3, #1 << SECTION_SHIFT - strls r3, [r0], #1 << PMD_ORDER + strls r3, [r0], #1 << PMD_ENTRY_ORDER bls 1b #endif @@ -297,10 +297,10 @@ __create_page_tables: */ mov r0, r2, lsr #SECTION_SHIFT cmp r2, #0 - ldrne r3, =FDT_FIXED_BASE >> (SECTION_SHIFT - PMD_ORDER) + ldrne r3, =FDT_FIXED_BASE >> (SECTION_SHIFT - PMD_ENTRY_ORDER) addne r3, r3, r4 orrne r6, r7, r0, lsl #SECTION_SHIFT - strne r6, [r3], #1 << PMD_ORDER + strne r6, [r3], #1 << PMD_ENTRY_ORDER addne r6, r6, #1 << SECTION_SHIFT strne r6, [r3] @@ -319,7 +319,7 @@ __create_page_tables: addruart r7, r3, r0 mov r3, r3, lsr #SECTION_SHIFT - mov r3, r3, lsl #PMD_ORDER + mov r3, r3, lsl #PMD_ENTRY_ORDER add r0, r4, r3 mov r3, r7, lsr #SECTION_SHIFT @@ -349,7 +349,7 @@ __create_page_tables: * If we're using the NetWinder or CATS, we also need to map * in the 16550-type serial port for the debug messages */ - add r0, r4, #0xff000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0xff000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) orr r3, r7, #0x7c000000 str r3, [r0] #endif @@ -359,10 +359,10 @@ __create_page_tables: * Similar reasons here - for debug. This is * only for Acorn RiscPC architectures. */ - add r0, r4, #0x02000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0x02000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) orr r3, r7, #0x02000000 str r3, [r0] - add r0, r4, #0xd8000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0xd8000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) str r3, [r0] #endif #endif -- cgit v1.2.3