From 8211dad6279817a8966ff6b74c2c588dd4166f45 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:38:35 -0700 Subject: s390: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add s390-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This version is more complicated than others: because s390 fits two 2K page tables into one 4K page (so page->rcu_head must be shared between both halves), and already uses page->lru (which page->rcu_head overlays) to list any free halves; with clever management by page->_refcount bits. Build upon the existing management, adjusted to follow a new rule: that a page is never on the free list if pte_free_defer() was used on either half (marked by PageActive). And for simplicity, delay calling RCU until both halves are freed. Not adding back unallocated fragments to the list in pte_free_defer() can result in wasting some amount of memory for pagetables, depending on how long the allocated fragment will stay in use. In practice, this effect is expected to be insignificant, and not justify a far more complex approach, which might allow to add the fragments back later in __tlb_remove_table(), where we might not have a stable mm any more. [hughd@google.com: Claudio finds warning on mm_has_pgste() more useful than on mm_alloc_pgste()] Link: https://lkml.kernel.org/r/3bc095ba-a180-ce3b-82b1-2bfc64612f3@google.com Link: https://lkml.kernel.org/r/94eccf5f-264c-8abe-4567-e77f4b4e14a@google.com Signed-off-by: Hugh Dickins Reviewed-by: Gerald Schaefer Tested-by: Alexander Gordeev Acked-by: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +++ arch/s390/mm/pgalloc.c | 80 ++++++++++++++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 12 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 17eb618f1348..89a9d5ef94f8 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 66ab68db9842..d7374add7820 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -229,6 +229,15 @@ void page_table_free_pgste(struct page *page) * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable * while the PP bits are never used, nor such a page is added to or removed * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. */ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -261,7 +270,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24)); - list_del(&page->lru); + list_del_init(&page->lru); } } spin_unlock_bh(&mm->context.lock); @@ -281,6 +290,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ + INIT_LIST_HEAD(&page->lru); atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table, { char msg[128]; - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) return; snprintf(msg, sizeof(msg), "Invalid pgtable %p release half 0x%02x mask 0x%02x", @@ -308,6 +320,15 @@ static void page_table_release_check(struct page *page, void *table, dump_page(page, msg); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; @@ -325,10 +346,17 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); mask >>= 24; @@ -342,8 +370,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, @@ -370,10 +400,18 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); @@ -403,9 +441,27 @@ void __tlb_remove_table(void *_table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, -- cgit v1.2.3 From b43b3fff042d08a0bcc0a6d87c3216b44860298e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:11 +0800 Subject: s390: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for s390's special operation when ioremap() and iounmap(). And also replace including with in arch/s390/kernel/perf_cpum_sf.c, otherwise building error will be seen because macro defined in can't be seen in perf_cpum_sf.c. Link: https://lkml.kernel.org/r/20230706154520.11257-11-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Niklas Schnelle Tested-by: Niklas Schnelle Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + arch/s390/include/asm/io.h | 21 +++++++++-------- arch/s390/pci/pci.c | 57 ++++++++-------------------------------------- 3 files changed, 23 insertions(+), 56 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5b39918b7042..290b6f93b816 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -143,6 +143,7 @@ config S390 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS + select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index e3882b012bfa..4453ad7c11ac 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -22,11 +22,18 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); -void __iomem *ioremap(phys_addr_t addr, size_t size); -void __iomem *ioremap_wc(phys_addr_t addr, size_t size); -void __iomem *ioremap_wt(phys_addr_t addr, size_t size); -void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range -#define ioremap ioremap -#define ioremap_wt ioremap_wt -#define ioremap_wc ioremap_wc - #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index afc3f33788da..d34d5813d006 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -244,62 +244,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset, vaddr; - struct vm_struct *area; - phys_addr_t last_addr; - - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. + */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) addr; + return (void __iomem *)phys_addr; - offset = addr & ~PAGE_MASK; - addr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { - free_vm_area(area); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); -} - -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) -{ - return __ioremap(addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); -void __iomem *ioremap(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(ioremap); - -void __iomem *ioremap_wc(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem *ioremap_wt(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wt); - void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/s390/mm/fault.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..6f6b9881e55e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; -- cgit v1.2.3 From 0b6f15824cc7e431a9706c78bfb9cb3011477ad3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:53 +0530 Subject: mm/vmemmap optimization: split hugetlb and devdax vmemmap optimization Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Link: https://lkml.kernel.org/r/20230724190759.483013-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 290b6f93b816..8ff6d1c21e38 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,7 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI -- cgit v1.2.3 From 6326c26c1514757242829b292b26eac589013200 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:56 -0700 Subject: s390: convert various pgalloc functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +- arch/s390/include/asm/tlb.h | 4 +- arch/s390/mm/pgalloc.c | 128 ++++++++++++++++++++-------------------- 3 files changed, 69 insertions(+), 67 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 89a9d5ef94f8..376b4b23bdaa 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b91f4a9b044c..383b1f91442c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d7374add7820..07fc660a24aa 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ @@ -242,7 +242,7 @@ void page_table_free_pgste(struct page *page) unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long *table; - struct page *page; + struct ptdesc *ptdesc; unsigned int mask, bit; /* Try to get a fragment of a 4K page as a 2K page table */ @@ -250,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = NULL; spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; + ptdesc = list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; /* * The pending removal bits must also be checked. * Failure to do so might lead to an impossible @@ -264,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm) */ mask = (mask | (mask >> 4)) & 0x03U; if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, + atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)); - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } } spin_unlock_bh(&mm->context.lock); @@ -278,28 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); + arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - INIT_LIST_HEAD(&page->lru); - atomic_xor_bits(&page->_refcount, 0x03U << 24); + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); spin_unlock_bh(&mm->context.lock); } return table; @@ -322,19 +322,18 @@ static void page_table_release_check(struct page *page, void *table, static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); @@ -344,51 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) * will happen outside of the critical section from this * function or from __tlb_remove_table() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to head of list, to make * this freed half available for immediate reuse. */ - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); mask >>= 24; if (mask != 0x00U) return; half = 0x01U << bit; } else { half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - struct page *page; unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); @@ -398,19 +396,19 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, * outside of the critical section from __tlb_remove_table() or from * page_table_free() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to end of list, to make * this freed half available for reuse once its pending * bit has been cleared by __tlb_remove_table(). */ - list_add_tail(&page->lru, &mm->context.pgtable_list); + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); @@ -421,30 +419,30 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); switch (half) { case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(ptdesc); return; case 0x01U: /* lower 2K of a 4K page table */ case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); mask >>= 24; if (mask != 0x00U) return; break; case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; break; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -488,16 +486,20 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/s390/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) -- cgit v1.2.3 From 843f9310e00ad4d6207cff88c05ce90b857625d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:51 +0100 Subject: s390: implement the new page table range API Add set_ptes() and update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Gerald Schaefer Acked-by: Mike Rapoport (IBM) Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c55f3c3365af..02973c740a5b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -47,6 +47,7 @@ static inline void update_page_count(int level, long count) * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -1316,20 +1317,34 @@ pgprot_t pgprot_writecombine(pgprot_t prot); pgprot_t pgprot_writethrough(pgprot_t prot); /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - set_pte(ptep, entry); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, -- cgit v1.2.3