From 527ed4f7d902d362471a93e1a4afb604c18ceb48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:52 +0800 Subject: mm: remove arguments of show_mem() All callers of show_mem() pass 0 and NULL, so we can remove the two arguments by directly calling __show_mem(0, NULL, MAX_NR_ZONES - 1) in show_mem(). Link: https://lkml.kernel.org/r/20230630062253.189440-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/xmon/xmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index fae747cc57d2..ee17270d35d0 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1084,7 +1084,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0, NULL); + show_mem(); break; default: termch = cmd; -- cgit v1.2.3 From 1279aa0656bbebbeecbd3bec0dd50faf35e5db38 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:53 +0800 Subject: mm: make show_free_areas() static All callers of show_free_areas() pass 0 and NULL, so we can directly use show_mem() instead of show_free_areas(0, NULL), which could make show_free_areas() a static function. Link: https://lkml.kernel.org/r/20230630062253.189440-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/sparc/kernel/setup_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 1adf5c1c16b8..34ef7febf0d5 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -83,7 +83,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0, NULL); + show_mem(); if (!is_idle_task(current)) { local_irq_enable(); ksys_sync(); -- cgit v1.2.3 From 332c151c710ad404e6e67eba7ae899ad8333333f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 22 May 2023 17:43:10 -0700 Subject: arm64: mte: simplify swap tag restoration logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a result of the patches "mm: Call arch_swap_restore() from do_swap_page()" and "mm: Call arch_swap_restore() from unuse_pte()", there are no circumstances in which a swapped-in page is installed in a page table without first having arch_swap_restore() called on it. Therefore, we no longer need the logic in set_pte_at() that restores the tags, so remove it. Link: https://lkml.kernel.org/r/20230523004312.1807357-4-pcc@google.com Link: https://linux-review.googlesource.com/id/I8ad54476f3b2d0144ccd8ce0c1d7a2963e5ff6f3 Signed-off-by: Peter Collingbourne Reviewed-by: Steven Price Reviewed-by: Catalin Marinas Cc: Alexandru Elisei Cc: Chinwen Chang Cc: David Hildenbrand Cc: Evgenii Stepanov Cc: Greg Kroah-Hartman Cc: kasan-dev@googlegroups.com Cc: kasan-dev Cc: "Kuan-Ying Lee (李冠穎)" Cc: Qun-Wei Lin Cc: Suren Baghdasaryan Cc: Vincenzo Frascino Cc: Will Deacon Cc: "Huang, Ying" Signed-off-by: Andrew Morton --- arch/arm64/include/asm/mte.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 14 ++------------ arch/arm64/kernel/mte.c | 37 +++++++------------------------------ 3 files changed, 11 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index c028afb1cd0b..4cedbaa16f41 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) } void mte_zero_clear_page_tags(void *addr); -void mte_sync_tags(pte_t old_pte, pte_t pte); +void mte_sync_tags(pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); @@ -122,7 +122,7 @@ static inline bool try_page_mte_tagging(struct page *page) static inline void mte_zero_clear_page_tags(void *addr) { } -static inline void mte_sync_tags(pte_t old_pte, pte_t pte) +static inline void mte_sync_tags(pte_t pte) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0bd18de9fd97..e8a252e62b12 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -337,18 +337,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * don't expose tags (instruction fetches don't check tags). */ if (system_supports_mte() && pte_access_permitted(pte, false) && - !pte_special(pte)) { - pte_t old_pte = READ_ONCE(*ptep); - /* - * We only need to synchronise if the new PTE has tags enabled - * or if swapping in (in which case another mapping may have - * set tags in the past even if this PTE isn't tagged). - * (!pte_none() && !pte_present()) is an open coded version of - * is_swap_pte() - */ - if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte))) - mte_sync_tags(old_pte, pte); - } + !pte_special(pte) && pte_tagged(pte)) + mte_sync_tags(pte); __check_safe_pte_update(mm, ptep, pte); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4c5ef9b20065..4edecaac8f91 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -35,41 +35,18 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); #endif -static void mte_sync_page_tags(struct page *page, pte_t old_pte, - bool check_swap, bool pte_is_tagged) -{ - if (check_swap && is_swap_pte(old_pte)) { - swp_entry_t entry = pte_to_swp_entry(old_pte); - - if (!non_swap_entry(entry)) - mte_restore_tags(entry, page); - } - - if (!pte_is_tagged) - return; - - if (try_page_mte_tagging(page)) { - mte_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); - } -} - -void mte_sync_tags(pte_t old_pte, pte_t pte) +void mte_sync_tags(pte_t pte) { struct page *page = pte_page(pte); long i, nr_pages = compound_nr(page); - bool check_swap = nr_pages == 1; - bool pte_is_tagged = pte_tagged(pte); - - /* Early out if there's nothing to do */ - if (!check_swap && !pte_is_tagged) - return; /* if PG_mte_tagged is set, tags have already been initialised */ - for (i = 0; i < nr_pages; i++, page++) - if (!page_mte_tagged(page)) - mte_sync_page_tags(page, old_pte, check_swap, - pte_is_tagged); + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + } /* ensure the tags are visible before the PTE is set */ smp_wmb(); -- cgit v1.2.3 From de2e4626c70605b7ff5ab32b75336547663d465f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:33:08 -0700 Subject: arm: adjust_pte() use pte_offset_map_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of pte_lockptr(), use the recently added pte_offset_map_nolock() in adjust_pte(): because it gives the not-locked ptl for precisely that pte, which the caller can then safely lock; whereas pte_lockptr() is not so tightly coupled, because it dereferences the pmd pointer again. Link: https://lkml.kernel.org/r/4d5258bd-ffa0-018-253a-25f2c9b783f7@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm/mm/fault-armv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index ca5302b0b7ee..7cb125497976 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -117,11 +117,10 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, * must use the nested version. This also means we need to * open-code the spin-locking. */ - pte = pte_offset_map(pmd, address); + pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl); if (!pte) return 0; - ptl = pte_lockptr(vma->vm_mm, pmd); do_pte_lock(ptl); ret = do_adjust_pte(vma, address, pfn, pte); -- cgit v1.2.3 From 3d140215a6aec37f112aec1606c6a76f7e4443d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:34:25 -0700 Subject: powerpc: assert_pte_locked() use pte_offset_map_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of pte_lockptr(), use the recently added pte_offset_map_nolock() in assert_pte_locked(). BUG if pte_offset_map_nolock() fails. This mod might cause new crashes: which either expose my ignorance, or indicate issues to be fixed, or limit the usage of assert_pte_locked(). [hughd@google.com: assert_pte_locked() still needs the pmd_none() check] Link: https://lkml.kernel.org/r/c73d1543-532c-3da2-8cf2-a95363a14116@google.com Link: https://lkml.kernel.org/r/e8d56c95-c132-a82e-5f5f-7bb1b738b057@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cb2dcdb18f8e..a3dcdb2d5b4b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; @@ -329,8 +331,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3 From 32cc0b7c9d508efde8946a82eb3c4acfa8dfed15 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:35:59 -0700 Subject: powerpc: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add powerpc-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This is awkward because the struct page contains only one rcu_head, but that page may be shared between PTE_FRAG_NR pagetables, each wanting to use the rcu_head at the same time. But powerpc never reuses a fragment once it has been freed: so mark the page Active in pte_free_defer(), before calling pte_fragment_free() directly; and there call_rcu() to pte_free_now() when last fragment is freed and the page is PageActive. Link: https://lkml.kernel.org/r/6e3ca5f1-334d-4b14-b92d-fc8e99914fcb@google.com Suggested-by: Jason Gunthorpe Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgalloc.h | 4 ++++ arch/powerpc/mm/pgtable-frag.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 3360cad78ace..3a971e2a8c73 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) pte_fragment_free((unsigned long *)ptepage, 0); } +/* arch use pte_free_defer() implementation in arch/powerpc/mm/pgtable-frag.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an "index_size" so they know how to diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 20652daa1d7e..0c6b68130025 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -106,6 +106,15 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) return __alloc_for_ptecache(mm, kernel); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); @@ -115,8 +124,22 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { - if (!kernel) - pgtable_pte_page_dtor(page); - __free_page(page); + if (kernel) + __free_page(page); + else if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From ad1ac8d94cde7709e3ea5360963ff70df2c0b4aa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:37:24 -0700 Subject: sparc: add pte_free_defer() for pte_t *pgtable_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sparc-specific pte_free_defer(), to call pte_free() via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. sparc32 supports pagetables sharing a page, but does not support THP; sparc64 supports THP, but does not support pagetables sharing a page. So the sparc-specific pte_free_defer() is as simple as the generic one, except for converting between pte_t *pgtable_t and struct page *. Link: https://lkml.kernel.org/r/dc4f318d-a66a-5622-dc44-9018ea814b37@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgalloc_64.h | 4 ++++ arch/sparc/mm/init_64.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 7b5561d17ab1..caa7632be4c2 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -65,6 +65,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_free_kernel(struct mm_struct *mm, pte_t *pte); void pte_free(struct mm_struct *mm, pgtable_t ptepage); +/* arch use pte_free_defer() implementation in arch/sparc/mm/init_64.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 04f9db0c3111..0d7fd793924c 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2930,6 +2930,22 @@ void pgtable_free(void *table, bool is_page) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + __pte_free((pgtable_t)page_address(page)); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + call_rcu(&page->rcu_head, pte_free_now); +} + void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { -- cgit v1.2.3 From 8211dad6279817a8966ff6b74c2c588dd4166f45 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:38:35 -0700 Subject: s390: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add s390-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This version is more complicated than others: because s390 fits two 2K page tables into one 4K page (so page->rcu_head must be shared between both halves), and already uses page->lru (which page->rcu_head overlays) to list any free halves; with clever management by page->_refcount bits. Build upon the existing management, adjusted to follow a new rule: that a page is never on the free list if pte_free_defer() was used on either half (marked by PageActive). And for simplicity, delay calling RCU until both halves are freed. Not adding back unallocated fragments to the list in pte_free_defer() can result in wasting some amount of memory for pagetables, depending on how long the allocated fragment will stay in use. In practice, this effect is expected to be insignificant, and not justify a far more complex approach, which might allow to add the fragments back later in __tlb_remove_table(), where we might not have a stable mm any more. [hughd@google.com: Claudio finds warning on mm_has_pgste() more useful than on mm_alloc_pgste()] Link: https://lkml.kernel.org/r/3bc095ba-a180-ce3b-82b1-2bfc64612f3@google.com Link: https://lkml.kernel.org/r/94eccf5f-264c-8abe-4567-e77f4b4e14a@google.com Signed-off-by: Hugh Dickins Reviewed-by: Gerald Schaefer Tested-by: Alexander Gordeev Acked-by: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +++ arch/s390/mm/pgalloc.c | 80 ++++++++++++++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 17eb618f1348..89a9d5ef94f8 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 66ab68db9842..d7374add7820 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -229,6 +229,15 @@ void page_table_free_pgste(struct page *page) * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable * while the PP bits are never used, nor such a page is added to or removed * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. */ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -261,7 +270,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24)); - list_del(&page->lru); + list_del_init(&page->lru); } } spin_unlock_bh(&mm->context.lock); @@ -281,6 +290,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ + INIT_LIST_HEAD(&page->lru); atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table, { char msg[128]; - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) return; snprintf(msg, sizeof(msg), "Invalid pgtable %p release half 0x%02x mask 0x%02x", @@ -308,6 +320,15 @@ static void page_table_release_check(struct page *page, void *table, dump_page(page, msg); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; @@ -325,10 +346,17 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); mask >>= 24; @@ -342,8 +370,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, @@ -370,10 +400,18 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); @@ -403,9 +441,27 @@ void __tlb_remove_table(void *_table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, -- cgit v1.2.3 From aa232204c4689427cefa55fe975692b57291523a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:31 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear Remove unused addr in page_table_check_pte_clear and __page_table_check_pte_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e8a252e62b12..f7ea51f9c1c1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -928,7 +928,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 75970ee2bda2..5e07312cd3e1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -529,7 +529,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..5085e838b860 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1068,7 +1068,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1084,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } -- cgit v1.2.3 From 1831414cd729a34af937d56ad684a66599de6344 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:32 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear Remove unused addr in page_table_check_pmd_clear and __page_table_check_pmd_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7ea51f9c1c1..6e3387ec6013 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -940,7 +940,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5e07312cd3e1..388c3af8a9f9 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -742,7 +742,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5085e838b860..5d71f933d933 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1133,7 +1133,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } -- cgit v1.2.3 From 931c38e16499a057e30a3033f4d6a9c242f0f156 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:33 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear Remove unused addr in __page_table_check_pud_clear and page_table_check_pud_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5d71f933d933..f07c610c3458 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1144,7 +1144,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } -- cgit v1.2.3 From 1066293d426d3000793c3c3b4276ef38b63ada4a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:34 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_set Remove unused addr in __page_table_check_pte_set and page_table_check_pte_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6e3387ec6013..f0a8dcbca04a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 388c3af8a9f9..90063afe8d36 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, addr, ptep, pteval); + page_table_check_pte_set(mm, ptep, pteval); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f07c610c3458..d14f0d92f04b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); set_pte(ptep, pte); } -- cgit v1.2.3 From a3b837130b5865521fa8662aceaa6ebc8d29389a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:35 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set Remove unused addr in __page_table_check_pmd_set and page_table_check_pmd_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f0a8dcbca04a..1fbf8d3f42b1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -524,7 +524,7 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -976,7 +976,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 90063afe8d36..a30658b2611b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -687,7 +687,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -758,7 +758,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d14f0d92f04b..9cc26cb0bc9f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1030,7 +1030,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1167,7 +1167,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { -- cgit v1.2.3 From 6d144436d954311f2dbacb5bf7b084042448d83e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:36 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set Remove unused addr in __page_table_check_pud_set and page_table_check_pud_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1fbf8d3f42b1..fe4b913589ee 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -531,7 +531,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a30658b2611b..44377f0d7c35 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -694,7 +694,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9cc26cb0bc9f..ada1bbf12961 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1037,7 +1037,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } -- cgit v1.2.3 From 0b1f77e74b5a9234e83dc89f1593b769547a37fa Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:02 +0800 Subject: asm-generic/iomap.h: remove ARCH_HAS_IOREMAP_xx macros Patch series "mm: ioremap: Convert architectures to take GENERIC_IOREMAP way", v8. Motivation and implementation: ============================== Currently, many architecutres have't taken the standard GENERIC_IOREMAP way to implement ioremap_prot(), iounmap(), and ioremap_xx(), but make these functions specifically under each arch's folder. Those cause many duplicated code of ioremap() and iounmap(). In this patchset, firstly introduce generic_ioremap_prot() and generic_iounmap() to extract the generic code for GENERIC_IOREMAP. By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic version if there's arch specific handling in its corresponding ioremap_prot(), ioremap() or iounmap(). With these changes, duplicated ioremap/iounmap() code uder ARCH-es are removed, and the equivalent functioality is kept as before. Background info: ================ 1) The converting more architectures to take GENERIC_IOREMAP way is suggested by Christoph in below discussion: https://lore.kernel.org/all/Yp7h0Jv6vpgt6xdZ@infradead.org/T/#u 2) In the previous v1 to v3, it's basically further action after arm64 has converted to GENERIC_IOREMAP way in below patchset. It's done by adding hook ioremap_allowed() and iounmap_allowed() in ARCH to add ARCH specific handling the middle of ioremap_prot() and iounmap(). [PATCH v5 0/6] arm64: Cleanup ioremap() and support ioremap_prot() https://lore.kernel.org/all/20220607125027.44946-1-wangkefeng.wang@huawei.com/T/#u Later, during v3 reviewing, Christophe Leroy suggested to introduce generic_ioremap_prot() and generic_iounmap() to generic codes, and ARCH can provide wrapper function ioremap_prot(), ioremap() or iounmap() if needed. Christophe made a RFC patchset as below to specially demonstrate his idea. This is what v4 and now v5 is doing. [RFC PATCH 0/8] mm: ioremap: Convert architectures to take GENERIC_IOREMAP way https://lore.kernel.org/all/cover.1665568707.git.christophe.leroy@csgroup.eu/T/#u Testing: ======== In v8, I only applied this patchset onto the latest linus's tree to build and run on arm64 and s390. This patch (of 19): Let's use '#define ioremap_xx' and "#ifdef ioremap_xx" instead. To remove defined ARCH_HAS_IOREMAP_xx macros in of each ARCH, the ARCH's own ioremap_wc|wt|np definition need be above "#include . Otherwise the redefinition error would be seen during compiling. So the relevant adjustments are made to avoid compiling error: loongarch: - doesn't include , defining ARCH_HAS_IOREMAP_WC is redundant, so simply remove it. m68k: - selected GENERIC_IOMAP, has been added in , and is included above , so simply remove ARCH_HAS_IOREMAP_WT defining. mips: - move "#include " below ioremap_wc definition in powerpc: - remove "#include " in because it's duplicated with the one in , let's rely on the latter. x86: - selected GENERIC_IOMAP, remove #include in the middle of . Let's rely on . Link: https://lkml.kernel.org/r/20230706154520.11257-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Geert Uytterhoeven Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Christophe Leroy Cc: David Laight Cc: Helge Deller Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Stafford Horne Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Max Filippov Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/io.h | 2 -- arch/m68k/include/asm/io_mm.h | 2 -- arch/m68k/include/asm/kmap.h | 2 -- arch/mips/include/asm/io.h | 5 ++--- arch/powerpc/include/asm/io.h | 9 +-------- arch/x86/include/asm/io.h | 5 ----- 6 files changed, 3 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 1c9410220040..0dcb36b32cb2 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -5,8 +5,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index d41fa488453b..6a0abd4846c6 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -26,8 +26,6 @@ #include #include -#include - #ifdef CONFIG_ATARI #define atari_readb raw_inb #define atari_writeb raw_outb diff --git a/arch/m68k/include/asm/kmap.h b/arch/m68k/include/asm/kmap.h index dec05743d426..4efb3efa593a 100644 --- a/arch/m68k/include/asm/kmap.h +++ b/arch/m68k/include/asm/kmap.h @@ -4,8 +4,6 @@ #ifdef CONFIG_MMU -#define ARCH_HAS_IOREMAP_WT - /* Values for nocacheflag and cmode */ #define IOMAP_FULL_CACHING 0 #define IOMAP_NOCACHE_SER 1 diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index affd21e9c20b..062dd4e6b954 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -12,8 +12,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include #include @@ -25,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -210,6 +207,8 @@ void iounmap(const volatile void __iomem *addr); #define ioremap_wc(offset, size) \ ioremap_prot((offset), (size), boot_cpu_data.writecombine) +#include + #if defined(CONFIG_CPU_CAVIUM_OCTEON) #define war_io_reorder_wmb() wmb() #else diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index f1e657c9bbe8..67a3fb6de498 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -3,11 +3,6 @@ #define _ASM_POWERPC_IO_H #ifdef __KERNEL__ -#define ARCH_HAS_IOREMAP_WC -#ifdef CONFIG_PPC32 -#define ARCH_HAS_IOREMAP_WT -#endif - /* */ @@ -732,9 +727,7 @@ static inline void name at \ #define writel_relaxed(v, addr) writel(v, addr) #define writeq_relaxed(v, addr) writeq(v, addr) -#ifdef CONFIG_GENERIC_IOMAP -#include -#else +#ifndef CONFIG_GENERIC_IOMAP /* * Here comes the implementation of the IOMAP interfaces. */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9025640f634..76238842406a 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include #include #include @@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include - /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped -- cgit v1.2.3 From 5bd2cc56667d9357c040e1980811fcdade79837e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:03 +0800 Subject: hexagon: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic ioremap_prot() and iounmap() are visible and available to arch. This change will simplify implementation by removing duplicated code with generic ioremap_prot() and iounmap(), and has the equivalent functioality. For hexagon, the current ioremap() and iounmap() are the same as generic version. After taking GENERIC_IOREMAP way, the old ioremap() and iounmap() can be completely removed. Link: https://lkml.kernel.org/r/20230706154520.11257-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Brian Cain Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/io.h | 11 ++++++---- arch/hexagon/kernel/hexagon_ksyms.c | 2 -- arch/hexagon/mm/Makefile | 2 +- arch/hexagon/mm/ioremap.c | 44 ------------------------------------- 5 files changed, 9 insertions(+), 51 deletions(-) delete mode 100644 arch/hexagon/mm/ioremap.c (limited to 'arch') diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 6726f4941015..a880ee067d2e 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -25,6 +25,7 @@ config HEXAGON select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP + select GENERIC_IOREMAP select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index 46a099de85b7..e2b308e32a37 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -27,8 +27,6 @@ extern int remap_area_pages(unsigned long start, unsigned long phys_addr, unsigned long end, unsigned long flags); -extern void iounmap(const volatile void __iomem *addr); - /* Defined in lib/io.c, needed for smc91x driver. */ extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen); extern void __raw_writesw(void __iomem *addr, const void *data, int wordlen); @@ -170,8 +168,13 @@ static inline void writel(u32 data, volatile void __iomem *addr) #define writew_relaxed __raw_writew #define writel_relaxed __raw_writel -void __iomem *ioremap(unsigned long phys_addr, unsigned long size); -#define ioremap_uc(X, Y) ioremap((X), (Y)) +/* + * I/O memory mapping functions. + */ +#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + (__HEXAGON_C_DEV << 6)) + +#define ioremap_uc(addr, size) ioremap((addr), (size)) #define __raw_writel writel diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index ec56ce2d92a2..36a80e31d187 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -14,12 +14,10 @@ EXPORT_SYMBOL(__clear_user_hexagon); EXPORT_SYMBOL(raw_copy_from_user); EXPORT_SYMBOL(raw_copy_to_user); -EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__vmgetie); EXPORT_SYMBOL(__vmsetie); EXPORT_SYMBOL(__vmyield); EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile index 49911a906fd0..ba4b04d962d6 100644 --- a/arch/hexagon/mm/Makefile +++ b/arch/hexagon/mm/Makefile @@ -3,5 +3,5 @@ # Makefile for Hexagon memory management subsystem # -obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o +obj-y := init.o uaccess.o vm_fault.o cache.o obj-y += copy_to_user.o copy_from_user.o vm_tlb.o diff --git a/arch/hexagon/mm/ioremap.c b/arch/hexagon/mm/ioremap.c deleted file mode 100644 index 255c5b1ee1a7..000000000000 --- a/arch/hexagon/mm/ioremap.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * I/O remap functions for Hexagon - * - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. - */ - -#include -#include -#include - -void __iomem *ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr, addr; - unsigned long offset = phys_addr & ~PAGE_MASK; - struct vm_struct *area; - - pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_READ|_PAGE_WRITE - |(__HEXAGON_C_DEV << 6)); - - last_addr = phys_addr + size - 1; - - /* Wrapping not allowed */ - if (!size || (last_addr < phys_addr)) - return NULL; - - /* Rounds up to next page size, including whole-page offset */ - size = PAGE_ALIGN(offset + size); - - area = get_vm_area(size, VM_IOREMAP); - addr = (unsigned long)area->addr; - - if (ioremap_page_range(addr, addr+size, phys_addr, prot)) { - vunmap((void *)addr); - return NULL; - } - - return (void __iomem *) (offset + addr); -} - -void iounmap(const volatile void __iomem *addr) -{ - vunmap((void *) ((unsigned long) addr & PAGE_MASK)); -} -- cgit v1.2.3 From 53c98e35dcbcacdb8e5c4d9c8fd6dfa8962af5c7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:04 +0800 Subject: openrisc: mm: remove unneeded early ioremap code Under arch/openrisc, there isn't any place where ioremap() is called. It means that there isn't early ioremap handling needed in openrisc, So the early ioremap handling code in ioremap() of arch/openrisc/mm/ioremap.c is unnecessary and can be removed. And also remove the special handling in iounmap() since no page is got from fixmap pool along with early ioremap code removing in ioremap(). Link: https://lore.kernel.org/linux-mm/YwxfxKrTUtAuejKQ@oscomms1/ Link: https://lkml.kernel.org/r/20230706154520.11257-4-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Stafford Horne Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/mm/ioremap.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 8ec0dafecf25..cdbcc7e73684 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -22,8 +22,6 @@ extern int mem_init_done; -static unsigned int fixmaps_used __initdata; - /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -52,24 +50,14 @@ void __iomem *__ref ioremap(phys_addr_t addr, unsigned long size) p = addr & PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - p; - if (likely(mem_init_done)) { - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - v = (unsigned long)area->addr; - } else { - if ((fixmaps_used + (size >> PAGE_SHIFT)) > FIX_N_IOREMAPS) - return NULL; - v = fix_to_virt(FIX_IOREMAP_BEGIN + fixmaps_used); - fixmaps_used += (size >> PAGE_SHIFT); - } + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + v = (unsigned long)area->addr; if (ioremap_page_range(v, v + size, p, __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_CI))) { - if (likely(mem_init_done)) - vfree(area->addr); - else - fixmaps_used -= (size >> PAGE_SHIFT); + vfree(area->addr); return NULL; } @@ -79,27 +67,6 @@ EXPORT_SYMBOL(ioremap); void iounmap(volatile void __iomem *addr) { - /* If the page is from the fixmap pool then we just clear out - * the fixmap mapping. - */ - if (unlikely((unsigned long)addr > FIXADDR_START)) { - /* This is a bit broken... we don't really know - * how big the area is so it's difficult to know - * how many fixed pages to invalidate... - * just flush tlb and hope for the best... - * consider this a FIXME - * - * Really we should be clearing out one or more page - * table entries for these virtual addresses so that - * future references cause a page fault... for now, we - * rely on two things: - * i) this code never gets called on known boards - * ii) invalid accesses to the freed areas aren't made - */ - flush_tlb_all(); - return; - } - return vfree((void *)(PAGE_MASK & (unsigned long)addr)); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 06dfae39d20091f3c8aa995f00505e1e148b0b3f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:08 +0800 Subject: arc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for arc's special operation when ioremap_prot() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-8-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Vineet Gupta Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/io.h | 7 +++---- arch/arc/mm/ioremap.c | 49 ++++------------------------------------------- 3 files changed, 8 insertions(+), 49 deletions(-) (limited to 'arch') diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 96cf8720bb93..6f4995ad9873 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -26,6 +26,7 @@ config ARC select GENERIC_PENDING_IRQ if SMP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD + select GENERIC_IOREMAP select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4 diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h index 80347382a380..4fdb7350636c 100644 --- a/arch/arc/include/asm/io.h +++ b/arch/arc/include/asm/io.h @@ -21,8 +21,9 @@ #endif extern void __iomem *ioremap(phys_addr_t paddr, unsigned long size); -extern void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, - unsigned long flags); +#define ioremap ioremap +#define ioremap_prot ioremap_prot +#define iounmap iounmap static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { return (void __iomem *)port; @@ -32,8 +33,6 @@ static inline void ioport_unmap(void __iomem *addr) { } -extern void iounmap(const volatile void __iomem *addr); - /* * io{read,write}{16,32}be() macros */ diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c index 712c2311daef..b07004d53267 100644 --- a/arch/arc/mm/ioremap.c +++ b/arch/arc/mm/ioremap.c @@ -8,7 +8,6 @@ #include #include #include -#include #include static inline bool arc_uncached_addr_space(phys_addr_t paddr) @@ -25,13 +24,6 @@ static inline bool arc_uncached_addr_space(phys_addr_t paddr) void __iomem *ioremap(phys_addr_t paddr, unsigned long size) { - phys_addr_t end; - - /* Don't allow wraparound or zero size */ - end = paddr + size - 1; - if (!size || (end < paddr)) - return NULL; - /* * If the region is h/w uncached, MMU mapping can be elided as optim * The cast to u32 is fine as this region can only be inside 4GB @@ -51,55 +43,22 @@ EXPORT_SYMBOL(ioremap); * ARC hardware uncached region, this one still goes thru the MMU as caller * might need finer access control (R/W/X) */ -void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, +void __iomem *ioremap_prot(phys_addr_t paddr, size_t size, unsigned long flags) { - unsigned int off; - unsigned long vaddr; - struct vm_struct *area; - phys_addr_t end; pgprot_t prot = __pgprot(flags); - /* Don't allow wraparound, zero size */ - end = paddr + size - 1; - if ((!size) || (end < paddr)) - return NULL; - - /* An early platform driver might end up here */ - if (!slab_is_available()) - return NULL; - /* force uncached */ - prot = pgprot_noncached(prot); - - /* Mappings have to be page-aligned */ - off = paddr & ~PAGE_MASK; - paddr &= PAGE_MASK_PHYS; - size = PAGE_ALIGN(end + 1) - paddr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - area->phys_addr = paddr; - vaddr = (unsigned long)area->addr; - if (ioremap_page_range(vaddr, vaddr + size, paddr, prot)) { - vunmap((void __force *)vaddr); - return NULL; - } - return (void __iomem *)(off + (char __iomem *)vaddr); + return generic_ioremap_prot(paddr, size, pgprot_noncached(prot)); } EXPORT_SYMBOL(ioremap_prot); - -void iounmap(const volatile void __iomem *addr) +void iounmap(volatile void __iomem *addr) { /* weird double cast to handle phys_addr_t > 32 bits */ if (arc_uncached_addr_space((phys_addr_t)(u32)addr)) return; - vfree((void *)(PAGE_MASK & (unsigned long __force)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 38d110aba3c4f3ab9a2bdf7862ae11afe5c96c43 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:09 +0800 Subject: ia64: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for ia64's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-9-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/io.h | 13 +++++-------- arch/ia64/mm/ioremap.c | 41 ++++++----------------------------------- 3 files changed, 12 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 2cd93e6bf0fe..3ab75f36c037 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -47,6 +47,7 @@ config IA64 select GENERIC_IRQ_LEGACY select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP + select GENERIC_IOREMAP select GENERIC_SMP_IDLE_THREAD select ARCH_TASK_STRUCT_ON_STACK select ARCH_TASK_STRUCT_ALLOCATOR diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 83a492c8d298..eedc0afa8cad 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -243,15 +243,12 @@ static inline void outsl(unsigned long port, const void *src, # ifdef __KERNEL__ -extern void __iomem * ioremap(unsigned long offset, unsigned long size); +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + extern void __iomem * ioremap_uc(unsigned long offset, unsigned long size); -extern void iounmap (volatile void __iomem *addr); -static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) -{ - return ioremap(phys_addr, size); -} -#define ioremap ioremap -#define ioremap_cache ioremap_cache + +#define ioremap_prot ioremap_prot +#define ioremap_cache ioremap #define ioremap_uc ioremap_uc #define iounmap iounmap diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 92b81bc91397..711b6abc822e 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -29,13 +29,9 @@ early_ioremap (unsigned long phys_addr, unsigned long size) return __ioremap_uc(phys_addr); } -void __iomem * -ioremap (unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long flags) { - void __iomem *addr; - struct vm_struct *area; - unsigned long offset; - pgprot_t prot; u64 attr; unsigned long gran_base, gran_size; unsigned long page_base; @@ -68,36 +64,12 @@ ioremap (unsigned long phys_addr, unsigned long size) */ page_base = phys_addr & PAGE_MASK; size = PAGE_ALIGN(phys_addr + size) - page_base; - if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) { - prot = PAGE_KERNEL; - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, prot)) { - vunmap((void __force *) addr); - return NULL; - } - - return (void __iomem *) (offset + (char __iomem *)addr); - } + if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) + return generic_ioremap_prot(phys_addr, size, __pgprot(flags)); return __ioremap_uc(phys_addr); } -EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_prot); void __iomem * ioremap_uc(unsigned long phys_addr, unsigned long size) @@ -114,8 +86,7 @@ early_iounmap (volatile void __iomem *addr, unsigned long size) { } -void -iounmap (volatile void __iomem *addr) +void iounmap(volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE) vunmap((void *) ((unsigned long) addr & PAGE_MASK)); -- cgit v1.2.3 From 9b994429fe1808d0b2caa85f3afcf88e007e2e79 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:10 +0800 Subject: openrisc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. For openrisc, the current ioremap() and iounmap() are the same as generic version. After taking GENERIC_IOREMAP way, the old ioremap() and iounmap() can be completely removed. Link: https://lkml.kernel.org/r/20230706154520.11257-10-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Stafford Horne Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/Kconfig | 1 + arch/openrisc/include/asm/io.h | 11 +++++----- arch/openrisc/mm/ioremap.c | 49 ------------------------------------------ 3 files changed, 7 insertions(+), 54 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index c7f282f60f64..fd9bb76a610b 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -21,6 +21,7 @@ config OPENRISC select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP + select GENERIC_IOREMAP select GENERIC_CPU_DEVICES select HAVE_PCI select HAVE_UID16 diff --git a/arch/openrisc/include/asm/io.h b/arch/openrisc/include/asm/io.h index ee6043a03173..5a6f0f16a5ce 100644 --- a/arch/openrisc/include/asm/io.h +++ b/arch/openrisc/include/asm/io.h @@ -15,6 +15,8 @@ #define __ASM_OPENRISC_IO_H #include +#include +#include /* * PCI: We do not use IO ports in OpenRISC @@ -27,11 +29,10 @@ #define PIO_OFFSET 0 #define PIO_MASK 0 -#define ioremap ioremap -void __iomem *ioremap(phys_addr_t offset, unsigned long size); - -#define iounmap iounmap -extern void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. + */ +#define _PAGE_IOREMAP (pgprot_val(PAGE_KERNEL) | _PAGE_CI) #include diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index cdbcc7e73684..91c8259d4b7e 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -22,55 +22,6 @@ extern int mem_init_done; -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem *__ref ioremap(phys_addr_t addr, unsigned long size) -{ - phys_addr_t p; - unsigned long v; - unsigned long offset, last_addr; - struct vm_struct *area = NULL; - - /* Don't allow wraparound or zero size */ - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - - /* - * Mappings have to be page-aligned - */ - offset = addr & ~PAGE_MASK; - p = addr & PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - p; - - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - v = (unsigned long)area->addr; - - if (ioremap_page_range(v, v + size, p, - __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_CI))) { - vfree(area->addr); - return NULL; - } - - return (void __iomem *)(offset + (char *)v); -} -EXPORT_SYMBOL(ioremap); - -void iounmap(volatile void __iomem *addr) -{ - return vfree((void *)(PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); - /** * OK, this one's a bit tricky... ioremap can get called before memory is * initialized (early serial console does this) and will want to alloc a page -- cgit v1.2.3 From b43b3fff042d08a0bcc0a6d87c3216b44860298e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:11 +0800 Subject: s390: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for s390's special operation when ioremap() and iounmap(). And also replace including with in arch/s390/kernel/perf_cpum_sf.c, otherwise building error will be seen because macro defined in can't be seen in perf_cpum_sf.c. Link: https://lkml.kernel.org/r/20230706154520.11257-11-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Niklas Schnelle Tested-by: Niklas Schnelle Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + arch/s390/include/asm/io.h | 21 +++++++++-------- arch/s390/pci/pci.c | 57 ++++++++-------------------------------------- 3 files changed, 23 insertions(+), 56 deletions(-) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5b39918b7042..290b6f93b816 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -143,6 +143,7 @@ config S390 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS + select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index e3882b012bfa..4453ad7c11ac 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -22,11 +22,18 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); -void __iomem *ioremap(phys_addr_t addr, size_t size); -void __iomem *ioremap_wc(phys_addr_t addr, size_t size); -void __iomem *ioremap_wt(phys_addr_t addr, size_t size); -void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range -#define ioremap ioremap -#define ioremap_wt ioremap_wt -#define ioremap_wc ioremap_wc - #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index afc3f33788da..d34d5813d006 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -244,62 +244,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset, vaddr; - struct vm_struct *area; - phys_addr_t last_addr; - - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. + */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) addr; + return (void __iomem *)phys_addr; - offset = addr & ~PAGE_MASK; - addr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { - free_vm_area(area); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); -} - -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) -{ - return __ioremap(addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); -void __iomem *ioremap(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(ioremap); - -void __iomem *ioremap_wc(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem *ioremap_wt(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wt); - void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From b94692e84dccf12dd30839c4e97b9ba028036a8f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:12 +0800 Subject: sh: add including In , it provides a generic implementation of all I/O accessors. For some port|mm io functions, SuperH has its own implementation in arch/sh/kernel/iomap.c and arch/sh/include/asm/io_noioport.h. These will conflict with those in and cause compiling error. Hence add macro definitions to ensure that the SuperH version of them will override the generic version. [arnd@arndb.de: fix asm-generic/io.h inclusion] Link: https://lkml.kernel.org/r/20230802141658.2064864-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230706154520.11257-12-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Arnd Bergmann Cc: John Paul Adrian Glaubitz Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/include/asm/io.h | 49 +++++++++++++++++++++++++++++++++++++++ arch/sh/include/asm/io_noioport.h | 7 ------ 2 files changed, 49 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index d8f3537ef57f..d8a0daf33ab5 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -119,6 +119,30 @@ void __raw_readsl(const void __iomem *addr, void *data, int longlen); __BUILD_MEMORY_STRING(__raw_, q, u64) +#define ioport_map ioport_map +#define ioport_unmap ioport_unmap +#define pci_iounmap pci_iounmap + +#define ioread8 ioread8 +#define ioread16 ioread16 +#define ioread16be ioread16be +#define ioread32 ioread32 +#define ioread32be ioread32be + +#define iowrite8 iowrite8 +#define iowrite16 iowrite16 +#define iowrite16be iowrite16be +#define iowrite32 iowrite32 +#define iowrite32be iowrite32be + +#define ioread8_rep ioread8_rep +#define ioread16_rep ioread16_rep +#define ioread32_rep ioread32_rep + +#define iowrite8_rep iowrite8_rep +#define iowrite16_rep iowrite16_rep +#define iowrite32_rep iowrite32_rep + #ifdef CONFIG_HAS_IOPORT_MAP /* @@ -221,10 +245,33 @@ __BUILD_IOPORT_STRING(q, u64) #endif +#define inb(addr) inb(addr) +#define inw(addr) inw(addr) +#define inl(addr) inl(addr) +#define outb(x, addr) outb((x), (addr)) +#define outw(x, addr) outw((x), (addr)) +#define outl(x, addr) outl((x), (addr)) + +#define inb_p(addr) inb(addr) +#define inw_p(addr) inw(addr) +#define inl_p(addr) inl(addr) +#define outb_p(x, addr) outb((x), (addr)) +#define outw_p(x, addr) outw((x), (addr)) +#define outl_p(x, addr) outl((x), (addr)) + +#define insb insb +#define insw insw +#define insl insl +#define outsb outsb +#define outsw outsw +#define outsl outsl #define IO_SPACE_LIMIT 0xffffffff /* We really want to try and get these to memcpy etc */ +#define memset_io memset_io +#define memcpy_fromio memcpy_fromio +#define memcpy_toio memcpy_toio void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); void memcpy_toio(volatile void __iomem *, const void *, unsigned long); void memset_io(volatile void __iomem *, int, unsigned long); @@ -288,6 +335,8 @@ static inline void iounmap(volatile void __iomem *addr) { } #define xlate_dev_mem_ptr(p) __va(p) #define unxlate_dev_mem_ptr(p, v) do { } while (0) +#include + #define ARCH_HAS_VALID_PHYS_ADDR_RANGE int valid_phys_addr_range(phys_addr_t addr, size_t size); int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h index f7938fe0f911..12dad91f41c1 100644 --- a/arch/sh/include/asm/io_noioport.h +++ b/arch/sh/include/asm/io_noioport.h @@ -46,13 +46,6 @@ static inline void ioport_unmap(void __iomem *addr) BUG(); } -#define inb_p(addr) inb(addr) -#define inw_p(addr) inw(addr) -#define inl_p(addr) inl(addr) -#define outb_p(x, addr) outb((x), (addr)) -#define outw_p(x, addr) outw((x), (addr)) -#define outl_p(x, addr) outl((x), (addr)) - static inline void insb(unsigned long port, void *dst, unsigned long count) { BUG(); -- cgit v1.2.3 From 0453c9a78015cb2219cda7239d881f4e3137bff8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:13 +0800 Subject: sh: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for SuperH's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: John Paul Adrian Glaubitz Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/Kconfig | 1 + arch/sh/include/asm/io.h | 40 ++++++----------------------- arch/sh/mm/ioremap.c | 65 ++++++++---------------------------------------- 3 files changed, 20 insertions(+), 86 deletions(-) (limited to 'arch') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2b3ce4fd3956..6be32254211c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -29,6 +29,7 @@ config SUPERH select GENERIC_SMP_IDLE_THREAD select GUP_GET_PXX_LOW_HIGH if X2TLB select HAS_IOPORT if HAS_IOPORT_MAP + select GENERIC_IOREMAP if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index d8a0daf33ab5..f2f38e9d489a 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -290,40 +290,16 @@ unsigned long long poke_real_address_q(unsigned long long addr, #endif #ifdef CONFIG_MMU -void iounmap(void __iomem *addr); -void __iomem *__ioremap_caller(phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller); - -static inline void __iomem *ioremap(phys_addr_t offset, unsigned long size) -{ - return __ioremap_caller(offset, size, PAGE_KERNEL_NOCACHE, - __builtin_return_address(0)); -} - -static inline void __iomem * -ioremap_cache(phys_addr_t offset, unsigned long size) -{ - return __ioremap_caller(offset, size, PAGE_KERNEL, - __builtin_return_address(0)); -} -#define ioremap_cache ioremap_cache - -#ifdef CONFIG_HAVE_IOREMAP_PROT -static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(offset, size, __pgprot(flags), - __builtin_return_address(0)); -} -#endif /* CONFIG_HAVE_IOREMAP_PROT */ +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap -#else /* CONFIG_MMU */ -static inline void __iomem *ioremap(phys_addr_t offset, size_t size) -{ - return (void __iomem *)(unsigned long)offset; -} +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_NOCACHE) -static inline void iounmap(volatile void __iomem *addr) { } +#define ioremap_cache(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) #endif /* CONFIG_MMU */ #define ioremap_uc ioremap diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 21342581144d..c33b3daa4ad1 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -72,22 +72,11 @@ __ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot) #define __ioremap_29bit(offset, size, prot) NULL #endif /* CONFIG_29BIT */ -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ref -__ioremap_caller(phys_addr_t phys_addr, unsigned long size, - pgprot_t pgprot, void *caller) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - struct vm_struct *area; - unsigned long offset, last_addr, addr, orig_addr; void __iomem *mapped; + pgprot_t pgprot = __pgprot(prot); mapped = __ioremap_trapped(phys_addr, size); if (mapped) @@ -97,11 +86,6 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size, if (mapped) return mapped; - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - /* * If we can't yet use the regular approach, go the fixmap route. */ @@ -112,34 +96,14 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size, * First try to remap through the PMB. * PMB entries are all pre-faulted. */ - mapped = pmb_remap_caller(phys_addr, size, pgprot, caller); + mapped = pmb_remap_caller(phys_addr, size, pgprot, + __builtin_return_address(0)); if (mapped && !IS_ERR(mapped)) return mapped; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (!area) - return NULL; - area->phys_addr = phys_addr; - orig_addr = addr = (unsigned long)area->addr; - - if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) { - vunmap((void *)orig_addr); - return NULL; - } - - return (void __iomem *)(offset + (char *)orig_addr); + return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(__ioremap_caller); +EXPORT_SYMBOL(ioremap_prot); /* * Simple checks for non-translatable mappings. @@ -158,10 +122,9 @@ static inline int iomapping_nontranslatable(unsigned long offset) return 0; } -void iounmap(void __iomem *addr) +void iounmap(volatile void __iomem *addr) { unsigned long vaddr = (unsigned long __force)addr; - struct vm_struct *p; /* * Nothing to do if there is no translatable mapping. @@ -172,21 +135,15 @@ void iounmap(void __iomem *addr) /* * There's no VMA if it's from an early fixed mapping. */ - if (iounmap_fixed(addr) == 0) + if (iounmap_fixed((void __iomem *)addr) == 0) return; /* * If the PMB handled it, there's nothing else to do. */ - if (pmb_unmap(addr) == 0) + if (pmb_unmap((void __iomem *)addr) == 0) return; - p = remove_vm_area((void *)(vaddr & PAGE_MASK)); - if (!p) { - printk(KERN_ERR "%s: bad address %p\n", __func__, addr); - return; - } - - kfree(p); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From ca6c1af38128f19c8a388eecfde165210d2c6059 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:14 +0800 Subject: xtensa: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot(), ioremap() and iounmap() for xtensa's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-14-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Chris Zankel Cc: Max Filippov Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/io.h | 32 +++++++++--------------- arch/xtensa/mm/ioremap.c | 58 +++++++++++--------------------------------- 3 files changed, 27 insertions(+), 64 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2a51a466779f..a5488cc40f58 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -28,6 +28,7 @@ config XTENSA select GENERIC_LIB_UCMPDI2 select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select GENERIC_IOREMAP if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index a5b707e1c0f4..934e58399c8c 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -24,22 +25,24 @@ #define PCI_IOBASE ((void __iomem *)XCHAL_KIO_BYPASS_VADDR) #ifdef CONFIG_MMU - -void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size); -void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size); -void xtensa_iounmap(volatile void __iomem *addr); - /* - * Return the virtual address for the specified bus memory. + * I/O memory mapping functions. */ +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot); +#define ioremap_prot ioremap_prot +#define iounmap iounmap + static inline void __iomem *ioremap(unsigned long offset, unsigned long size) { if (offset >= XCHAL_KIO_PADDR && offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_BYPASS_VADDR); else - return xtensa_ioremap_nocache(offset, size); + return ioremap_prot(offset, size, + pgprot_val(pgprot_noncached(PAGE_KERNEL))); } +#define ioremap ioremap static inline void __iomem *ioremap_cache(unsigned long offset, unsigned long size) @@ -48,21 +51,10 @@ static inline void __iomem *ioremap_cache(unsigned long offset, && offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_CACHED_VADDR); else - return xtensa_ioremap_cache(offset, size); -} -#define ioremap_cache ioremap_cache + return ioremap_prot(offset, size, pgprot_val(PAGE_KERNEL)); -static inline void iounmap(volatile void __iomem *addr) -{ - unsigned long va = (unsigned long) addr; - - if (!(va >= XCHAL_KIO_CACHED_VADDR && - va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) && - !(va >= XCHAL_KIO_BYPASS_VADDR && - va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE)) - xtensa_iounmap(addr); } - +#define ioremap_cache ioremap_cache #endif /* CONFIG_MMU */ #include diff --git a/arch/xtensa/mm/ioremap.c b/arch/xtensa/mm/ioremap.c index a400188c16b9..8ca660b7ab49 100644 --- a/arch/xtensa/mm/ioremap.c +++ b/arch/xtensa/mm/ioremap.c @@ -6,60 +6,30 @@ */ #include -#include #include #include #include -static void __iomem *xtensa_ioremap(unsigned long paddr, unsigned long size, - pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset = paddr & ~PAGE_MASK; - unsigned long pfn = __phys_to_pfn(paddr); - struct vm_struct *area; - unsigned long vaddr; - int err; - - paddr &= PAGE_MASK; - + unsigned long pfn = __phys_to_pfn((phys_addr)); WARN_ON(pfn_valid(pfn)); - size = PAGE_ALIGN(offset + size); - - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long)area->addr; - area->phys_addr = paddr; - - err = ioremap_page_range(vaddr, vaddr + size, paddr, prot); - - if (err) { - vunmap((void *)vaddr); - return NULL; - } - - flush_cache_vmap(vaddr, vaddr + size); - return (void __iomem *)(offset + vaddr); -} - -void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size) -{ - return xtensa_ioremap(addr, size, pgprot_noncached(PAGE_KERNEL)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(xtensa_ioremap_nocache); +EXPORT_SYMBOL(ioremap_prot); -void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size) +void iounmap(volatile void __iomem *addr) { - return xtensa_ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(xtensa_ioremap_cache); + unsigned long va = (unsigned long) addr; -void xtensa_iounmap(volatile void __iomem *io_addr) -{ - void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr); + if ((va >= XCHAL_KIO_CACHED_VADDR && + va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) || + (va >= XCHAL_KIO_BYPASS_VADDR && + va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE)) + return; - vunmap(addr); + generic_iounmap(addr); } -EXPORT_SYMBOL(xtensa_iounmap); +EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 426b313f356a314af335899c51250dc0f49cd4a7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:15 +0800 Subject: parisc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper function ioremap_prot() for parisc's special operation when iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-15-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Acked-by: Helge Deller Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/io.h | 15 +++++++---- arch/parisc/mm/ioremap.c | 62 +++----------------------------------------- 3 files changed, 15 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4cb46d5c64a2..4cda9f0a3277 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -36,6 +36,7 @@ config PARISC select GENERIC_ATOMIC64 if !64BIT select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP + select GENERIC_IOREMAP select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index c05e781be2f5..366537042465 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -125,12 +125,17 @@ static inline void gsc_writeq(unsigned long long val, unsigned long addr) /* * The standard PCI ioremap interfaces */ -void __iomem *ioremap(unsigned long offset, unsigned long size); -#define ioremap_wc ioremap -#define ioremap_uc ioremap -#define pci_iounmap pci_iounmap +#define ioremap_prot ioremap_prot + +#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_NO_CACHE) -extern void iounmap(const volatile void __iomem *addr); +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), _PAGE_IOREMAP) +#define ioremap_uc(addr, size) \ + ioremap_prot((addr), (size), _PAGE_IOREMAP) + +#define pci_iounmap pci_iounmap void memset_io(volatile void __iomem *addr, unsigned char val, int count); void memcpy_fromio(void *dst, const volatile void __iomem *src, int count); diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c index 345ff0b66499..fd996472dfe7 100644 --- a/arch/parisc/mm/ioremap.c +++ b/arch/parisc/mm/ioremap.c @@ -13,25 +13,9 @@ #include #include -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem *ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - void __iomem *addr; - struct vm_struct *area; - unsigned long offset, last_addr; - pgprot_t pgprot; - #ifdef CONFIG_EISA unsigned long end = phys_addr + size - 1; /* Support EISA addresses */ @@ -40,11 +24,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size) phys_addr |= F_EXTEND(0xfc000000); #endif - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - /* * Don't allow anybody to remap normal RAM that we're using.. */ @@ -62,39 +41,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size) } } - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | - _PAGE_ACCESSED | _PAGE_NO_CACHE); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - vunmap(addr); - return NULL; - } - - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(ioremap); - -void iounmap(const volatile void __iomem *io_addr) -{ - unsigned long addr = (unsigned long)io_addr & PAGE_MASK; - - if (is_vmalloc_addr((void *)addr)) - vunmap((void *)addr); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(ioremap_prot); -- cgit v1.2.3 From 016fec91013cfb27c9f5a101a87ae8266537fed1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:17 +0800 Subject: mm: move is_ioremap_addr() into new header file Now is_ioremap_addr() is only used in kernel/iomem.c and gonna be used in mm/ioremap.c. Move it into its own new header file linux/ioremap.h. Link: https://lkml.kernel.org/r/20230706154520.11257-17-bhe@redhat.com Suggested-by: Christoph Hellwig Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6a88bfdaa69b..445a22987aa3 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,16 +157,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -#ifdef CONFIG_PPC64 -#define is_ioremap_addr is_ioremap_addr -static inline bool is_ioremap_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= IOREMAP_BASE && addr < IOREMAP_END; -} -#endif /* CONFIG_PPC64 */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ -- cgit v1.2.3 From 8d05554dca2af6a77dc38a152f4e84fdf7e35179 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:18 +0800 Subject: powerpc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for powerpc's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-18-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/io.h | 8 +++----- arch/powerpc/mm/ioremap.c | 26 +------------------------- arch/powerpc/mm/ioremap_32.c | 19 +++++++++---------- arch/powerpc/mm/ioremap_64.c | 12 ++---------- 5 files changed, 16 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0b1172cbeccb..9222c138c457 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -193,6 +193,7 @@ config PPC select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY + select GENERIC_IOREMAP select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 67a3fb6de498..0732b743e099 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -889,8 +889,8 @@ static inline void iosync(void) * */ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); -extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, - unsigned long flags); +#define ioremap ioremap +#define ioremap_prot ioremap_prot extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); #define ioremap_wc ioremap_wc @@ -904,14 +904,12 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_cache(addr, size) \ ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) -extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 4f12504fb405..705e8e8ffde4 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) { pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); @@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, return 0; } - -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller) -{ - struct vm_struct *area; - int ret; - unsigned long va; - - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = pa; - va = (unsigned long)area->addr; - - ret = ioremap_page_range(va, va + size, pa, prot); - if (!ret) - return (void __iomem *)area->addr + offset; - - vunmap_range(va, va + size); - free_vm_area(area); - - return NULL; -} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 9d13143b8be4..ca5bc6be3e6f 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -21,6 +21,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call phys_addr_t p, offset; int err; + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. @@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16 * 1024 * 1024) - p += _ISA_MEM_BASE; - #ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. @@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call return (void __iomem *)v + offset; if (slab_is_available()) - return do_ioremap(p, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); /* * Should check if it is a candidate for a BAT mapping @@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr) if (v_block_mapped((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long)addr < ioremap_bot) - vunmap((void *)(PAGE_MASK & (unsigned long)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 3acece00b33e..d24e5f166723 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; if (slab_is_available()) - return do_ioremap(paligned, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); @@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, */ void iounmap(volatile void __iomem *token) { - void *addr; - if (!slab_is_available()) return; - addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); - - if ((unsigned long)addr < ioremap_bot) { - pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); - return; - } - vunmap(addr); + generic_iounmap(PCI_FIX_ADDR(token)); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 8f03d74f716313892908819a389ff9ede483c3a5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:19 +0800 Subject: arm64 : mm: add wrapper function ioremap_prot() Since hook functions ioremap_allowed() and iounmap_allowed() will be obsoleted, add wrapper function ioremap_prot() to contain the the specific handling in addition to generic_ioremap_prot() invocation. Link: https://lkml.kernel.org/r/20230706154520.11257-19-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/io.h | 3 +-- arch/arm64/mm/ioremap.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 51d92abf945e..3b694511b98f 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -139,8 +139,7 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * I/O memory mapping functions. */ -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot); -#define ioremap_allowed ioremap_allowed +#define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index c5af103d4ad4..269f2f63ab7d 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,20 +3,22 @@ #include #include -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) - return false; + return NULL; /* Don't allow RAM to be mapped. */ if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) - return false; + return NULL; - return true; + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } +EXPORT_SYMBOL(ioremap_prot); /* * Must be called after early_fixmap_init -- cgit v1.2.3 From 65c8d30e679bdffeeaa0b84b7094a3c719aa6585 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 17 Jul 2023 21:10:01 +0800 Subject: mm/tlbbatch: introduce arch_tlbbatch_should_defer() Patch series "arm64: support batched/deferred tlb shootdown during page reclamation/migration", v11. Though ARM64 has the hardware to do tlb shootdown, the hardware broadcasting is not free. A simplest micro benchmark shows even on snapdragon 888 with only 8 cores, the overhead for ptep_clear_flush is huge even for paging out one page mapped by only one process: 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush While pages are mapped by multiple processes or HW has more CPUs, the cost should become even higher due to the bad scalability of tlb shootdown. The same benchmark can result in 16.99% CPU consumption on ARM64 server with around 100 cores according to the test on patch 4/4. This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by 1. only send tlbi instructions in the first stage - arch_tlbbatch_add_mm() 2. wait for the completion of tlbi by dsb while doing tlbbatch sync in arch_tlbbatch_flush() Testing on snapdragon shows the overhead of ptep_clear_flush is removed by the patchset. The micro benchmark becomes 5% faster even for one page mapped by single process on snapdragon 888. Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset does some renaming/extension for the current implementation first (Patch 1-3), then add the support on arm64 (Patch 4). This patch (of 4): The entire scheme of deferred TLB flush in reclaim path rests on the fact that the cost to refill TLB entries is less than flushing out individual entries by sending IPI to remote CPUs. But architecture can have different ways to evaluate that. Hence apart from checking TTU_BATCH_FLUSH in the TTU flags, rest of the decision should be architecture specific. [yangyicong@hisilicon.com: rebase and fix incorrect return value type] Link: https://lkml.kernel.org/r/20230717131004.12662-1-yangyicong@huawei.com Link: https://lkml.kernel.org/r/20230717131004.12662-2-yangyicong@huawei.com Signed-off-by: Anshuman Khandual [https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khandual@linux.vnet.ibm.com/] Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Anshuman Khandual Reviewed-by: Barry Song Reviewed-by: Xin Hao Tested-by: Punit Agrawal Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Cc: Barry Song Cc: Mel Gorman Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..cf2a1de5d388 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -253,6 +253,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* -- cgit v1.2.3 From f73419bb89d606de9be2043febf0957d56627a5b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:02 +0800 Subject: mm/tlbbatch: rename and extend some functions This patch does some preparation works to extend batched TLB flush to arm64. Including: - Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept an additional argument for address, architectures like arm64 may need this for tlbi. - Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match its current function since we don't need to handle mm on architectures like arm64 and add_mm is not proper, add_pending will make sense to both as on x86 we're pending the TLB flush operations while on arm64 we're pending the synchronize operations. This intends no functional changes on x86. Link: https://lkml.kernel.org/r/20230717131004.12662-3-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf2a1de5d388..1c7d3a36e16c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -276,8 +276,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); -- cgit v1.2.3 From db6c1f6f236dbcd271d51d37675bbccfcea7c7be Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Jul 2023 21:10:03 +0800 Subject: mm/tlbbatch: introduce arch_flush_tlb_batched_pending() Currently we'll flush the mm in flush_tlb_batched_pending() to avoid race between reclaim unmaps pages by batched TLB flush and mprotect/munmap/etc. Other architectures like arm64 may only need a synchronization barrier(dsb) here rather than a full mm flush. So add arch_flush_tlb_batched_pending() to allow an arch-specific implementation here. This intends no functional changes on x86 since still a full mm flush for x86. Link: https://lkml.kernel.org/r/20230717131004.12662-4-yangyicong@huawei.com Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Kefeng Wang Cc: lipeifeng Cc: Mark Rutland Cc: Mel Gorman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Xin Hao Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1c7d3a36e16c..837e4a50281a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -284,6 +284,11 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); +} + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, -- cgit v1.2.3 From 43b3dfdd04553171488cb11d46d21948b6b90e27 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:04 +0800 Subject: arm64: support batched/deferred tlb shootdown during page reclamation/migration On x86, batched and deferred tlb shootdown has lead to 90% performance increase on tlb shootdown. on arm64, HW can do tlb shootdown without software IPI. But sync tlbi is still quite expensive. Even running a simplest program which requires swapout can prove this is true, #include #include #include #include int main() { #define SIZE (1 * 1024 * 1024) volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); memset(p, 0x88, SIZE); for (int k = 0; k < 10000; k++) { /* swap in */ for (int i = 0; i < SIZE; i += 4096) { (void)p[i]; } /* swap out */ madvise(p, SIZE, MADV_PAGEOUT); } } Perf result on snapdragon 888 with 8 cores by using zRAM as the swap block device. ~ # perf record taskset -c 4 ./a.out [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ] ~ # perf report # To display the perf.data header info, please use --header/--header-only options. # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 60K of event 'cycles' # Event count (approx.): 35706225414 # # Overhead Command Shared Object Symbol # ........ ....... ................. ...... # 21.07% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irq 8.23% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 6.67% a.out [kernel.kallsyms] [k] filemap_map_pages 6.16% a.out [kernel.kallsyms] [k] __zram_bvec_write 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush 3.71% a.out [kernel.kallsyms] [k] _raw_spin_lock 3.49% a.out [kernel.kallsyms] [k] memset64 1.63% a.out [kernel.kallsyms] [k] clear_page 1.42% a.out [kernel.kallsyms] [k] _raw_spin_unlock 1.26% a.out [kernel.kallsyms] [k] mod_zone_state.llvm.8525150236079521930 1.23% a.out [kernel.kallsyms] [k] xas_load 1.15% a.out [kernel.kallsyms] [k] zram_slot_lock ptep_clear_flush() takes 5.36% CPU in the micro-benchmark swapping in/out a page mapped by only one process. If the page is mapped by multiple processes, typically, like more than 100 on a phone, the overhead would be much higher as we have to run tlb flush 100 times for one single page. Plus, tlb flush overhead will increase with the number of CPU cores due to the bad scalability of tlb shootdown in HW, so those ARM64 servers should expect much higher overhead. Further perf annonate shows 95% cpu time of ptep_clear_flush is actually used by the final dsb() to wait for the completion of tlb flush. This provides us a very good chance to leverage the existing batched tlb in kernel. The minimum modification is that we only send async tlbi in the first stage and we send dsb while we have to sync in the second stage. With the above simplest micro benchmark, collapsed time to finish the program decreases around 5%. Typical collapsed time w/o patch: ~ # time taskset -c 4 ./a.out 0.21user 14.34system 0:14.69elapsed w/ patch: ~ # time taskset -c 4 ./a.out 0.22user 13.45system 0:13.80elapsed Also tested with benchmark in the commit on Kunpeng920 arm64 server and observed an improvement around 12.5% with command `time ./swap_bench`. w/o w/ real 0m13.460s 0m11.771s user 0m0.248s 0m0.279s sys 0m12.039s 0m11.458s Originally it's noticed a 16.99% overhead of ptep_clear_flush() which has been eliminated by this patch: [root@localhost yang]# perf record -- ./swap_bench && perf report [...] 16.99% swap_bench [kernel.kallsyms] [k] ptep_clear_flush It is tested on 4,8,128 CPU platforms and shows to be beneficial on large systems but may not have improvement on small systems like on a 4 CPU platform. Also this patch improve the performance of page migration. Using pmbench and tries to migrate the pages of pmbench between node 0 and node 1 for 100 times for 1G memory, this patch decrease the time used around 20% (prev 18.338318910 sec after 13.981866350 sec) and saved the time used by ptep_clear_flush(). Link: https://lkml.kernel.org/r/20230717131004.12662-5-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/tlbbatch.h | 12 +++++++++++ arch/arm64/include/asm/tlbflush.h | 44 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/include/asm/tlbbatch.h (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2511b30d0f6..751d8c8821db 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -96,6 +96,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h new file mode 100644 index 000000000000..fedb0b87b8db --- /dev/null +++ b/arch/arm64/include/asm/tlbbatch.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_ARM64_TLBBATCH_H +#define _ARCH_ARM64_TLBBATCH_H + +struct arch_tlbflush_unmap_batch { + /* + * For arm64, HW can do tlb shootdown, so we don't + * need to record cpumask for sending IPI + */ +}; + +#endif /* _ARCH_ARM64_TLBBATCH_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 412a3b9a3c25..3456866c6a1d 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ish); } -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) +static inline void __flush_tlb_page_nosync(struct mm_struct *mm, + unsigned long uaddr) { unsigned long addr; dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); } +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr) +{ + return __flush_tlb_page_nosync(vma->vm_mm, uaddr); +} + static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { @@ -272,6 +278,38 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ish); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI + /* + * TLB flush deferral is not required on systems which are affected by + * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation + * will have two consecutive TLBI instructions with a dsb(ish) in between + * defeating the purpose (i.e save overall 'dsb ish' cost). + */ + if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI))) + return false; +#endif + return true; +} + +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) +{ + __flush_tlb_page_nosync(mm, uaddr); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + dsb(ish); +} + +static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + dsb(ish); +} + /* * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. -- cgit v1.2.3 From 6bbd42e2df8f90b022fa0b82986d58ecb30b9dcc Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:05 +1000 Subject: mmu_notifiers: call invalidate_range() when invalidating TLBs The invalidate_range() is going to become an architecture specific mmu notifier used to keep the TLB of secondary MMUs such as an IOMMU in sync with the CPU page tables. Currently it is called from separate code paths to the main CPU TLB invalidations. This can lead to a secondary TLB not getting invalidated when required and makes it hard to reason about when exactly the secondary TLB is invalidated. To fix this move the notifier call to the architecture specific TLB maintenance functions for architectures that have secondary MMUs requiring explicit software invalidations. This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades require a TLB invalidation. This invalidation is done by the architecture specific ptep_set_access_flags() which calls flush_tlb_page() if required. However this doesn't call the notifier resulting in infinite faults being generated by devices using the SMMU if it has previously cached a read-only PTE in it's TLB. Moving the invalidations into the TLB invalidation functions ensures all invalidations happen at the same time as the CPU invalidation. The architecture specific flush_tlb_all() routines do not call the notifier as none of the IOMMUs require this. Link: https://lkml.kernel.org/r/0287ae32d91393a582897d6c4db6f7456b1001f2.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Tested-by: SeongJae Park Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Tested-by: Luis Chamberlain Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 5 +++++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 1 + arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 + arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++++ arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/tlb.c | 2 ++ 6 files changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3456866c6a1d..a99349d10c0e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -252,6 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -263,6 +265,8 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); + mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + (uaddr & PAGE_MASK) + PAGE_SIZE); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, @@ -396,6 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 0d0c1447ecf0..dca0477b0709 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -5,6 +5,7 @@ #define MMU_NO_CONTEXT ~0UL #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 5e3195568525..f3fb49fd32fe 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 0bd4866d9824..4d44902a4962 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,6 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1020,6 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1228,6 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1392,6 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 837e4a50281a..0a5432364c5a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include +#include #include #include @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480a..93b2f81f09da 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); + mmu_notifier_invalidate_range(mm, start, end); } -- cgit v1.2.3 From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption as these callbacks happen too soon/late during page unmap. mmu notifier users should therefore either use the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update documention. Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 6 +++--- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++++---- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a99349d10c0e..84a05a0bd2b6 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -253,7 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -265,7 +265,7 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); - mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } @@ -400,7 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index f3fb49fd32fe..17075c78d4bc 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,7 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d44902a4962..06e647ef19d1 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,7 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1021,7 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1230,7 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1395,7 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0a5432364c5a..6ab42caaa67a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -283,7 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93b2f81f09da..2d253919b3e8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1037,7 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 2 -- arch/powerpc/mm/fault.c | 4 ---- arch/riscv/mm/fault.c | 4 ---- arch/s390/mm/fault.c | 2 -- arch/x86/mm/fault.c | 4 ---- 5 files changed, 16 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fe516b32577..103fcbdc6552 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -#ifdef CONFIG_PER_VMA_LOCK if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, return 0; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5bfdf6ecfa96..fafce6bdeff0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, return user_mode(regs) ? 0 : SIGBUS; lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -551,9 +549,7 @@ retry: mmap_read_unlock(current->mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6ea2cce4cc17..046732fcb48c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs) return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -368,9 +366,7 @@ retry: mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) { tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..6f6b9881e55e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..787da09d24f3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1416,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; -- cgit v1.2.3 From 0b6f15824cc7e431a9706c78bfb9cb3011477ad3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:53 +0530 Subject: mm/vmemmap optimization: split hugetlb and devdax vmemmap optimization Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Link: https://lkml.kernel.org/r/20230724190759.483013-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 2 +- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e71d5bf2cee0..dc56ddf9ba03 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -60,7 +60,7 @@ config LOONGARCH select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select BUILDTIME_TABLE_SORT select COMMON_CLK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..6943d34c1ec1 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 290b6f93b816..8ff6d1c21e38 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,7 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..78224aa76409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -128,7 +128,8 @@ config X86 select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT -- cgit v1.2.3 From 104c49d5b6dc8b38aea0cb7300068b378cf37ac1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:54 +0530 Subject: powerpc/mm/trace: convert trace event to trace event class A follow-up patch will add a pud variant for this same event. Using event class makes that addition simpler. No functional change in this patch. Link: https://lkml.kernel.org/r/20230724190759.483013-9-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 51f48984abca..988948d69bc1 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e7ea492ac510..02e185d2e4d6 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add #endif old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); return old; } -- cgit v1.2.3 From 27af67f35631ac4b61b5e4455b44c9aee8d2cc4b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:55 +0530 Subject: powerpc/book3s64/mm: enable transparent pud hugepage This is enabled only with radix translation and 1G hugepage size. This will be used with devdax device memory with a namespace alignment of 1G. Anon transparent hugepage is not supported even though we do have helpers checking pud_trans_huge(). We should never find that return true. The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP. Some of the helpers are never expected to get called on hash translation and hence is marked to call BUG() in such a case. Link: https://lkml.kernel.org/r/20230724190759.483013-10-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/hash.h | 9 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 155 +++++++++++++++++++-- arch/powerpc/include/asm/book3s/64/radix.h | 36 +++++ .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 2 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 ++ arch/powerpc/mm/book3s64/pgtable.c | 78 +++++++++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 28 ++++ arch/powerpc/mm/book3s64/radix_tlb.c | 7 + arch/powerpc/platforms/Kconfig.cputype | 1 + 9 files changed, 313 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d4a19e6547ac..6e70ae511631 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) } #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) + +/* + * pud comparison that will work with both pte and page table pointer. + */ +static inline int hash__pud_same(pud_t pud_a, pud_t pud_b) +{ + return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) + static inline int hash__p4d_bad(p4d_t p4d) { return (p4d_val(p4d) == 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4acc9690f599..a8204566cfd0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) { return __pud_raw(pte_raw(pte)); } + +static inline pte_t *pudp_ptep(pud_t *pud) +{ + return (pte_t *)pud; +} + +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud)) +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline int pud_bad(pud_t pud) { if (radix_enabled()) @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} + extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void) } #define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_pud_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_pud_hugepage(); + return 0; +} +#define has_transparent_pud_hugepage has_transparent_pud_hugepage + static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +static inline unsigned long +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pud_hugepage_update(mm, addr, pudp, clr, set); + BUG(); + return pud_val(*pudp); +} + /* * returns true for pmd migration entries, THP, devmap, hugetlb * But compile time dependent on THP config @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd) return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } +static inline int pud_large(pud_t pud) +{ + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); +} + /* * For radix we should always find H_PAGE_HASHPTE zero. Hence * the below will work for radix too @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, return ((old & _PAGE_ACCESSED) != 0); } +static inline int __pudp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + unsigned long old; + + if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +#define __HAVE_ARCH_PUDP_SET_WRPROTECT +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pud_t *pudp) +{ + if (pud_write(*pudp)) + pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0); +} + /* * Only returns true for a THP. False for pmd migration entry. * We also need to return true when we come across a pte that @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd) return hash__pmd_trans_huge(pmd); } +static inline int pud_trans_huge(pud_t pud) +{ + if (!pud_present(pud)) + return false; + + if (radix_enabled()) + return radix__pud_trans_huge(pud); + return 0; +} + + #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { @@ -1203,6 +1286,15 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) return hash__pmd_same(pmd_a, pmd_b); } +#define pud_same pud_same +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + if (radix_enabled()) + return radix__pud_same(pud_a, pud_b); + return hash__pud_same(pud_a, pud_b); +} + + static inline pmd_t __pmd_mkhuge(pmd_t pmd) { if (radix_enabled()) @@ -1210,6 +1302,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd) return hash__pmd_mkhuge(pmd); } +static inline pud_t __pud_mkhuge(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkhuge(pud); + BUG(); + return pud; +} + /* * pfn_pmd return a pmd_t that can be used as pmd pte entry. */ @@ -1225,14 +1325,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) return pmd; } +static inline pud_t pud_mkhuge(pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + if (radix_enabled()) + WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0); + else + WARN_ON(1); +#endif + return pud; +} + + #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp); + #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1243,6 +1363,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (radix_enabled()) + return radix__pudp_huge_get_and_clear(mm, addr, pudp); + BUG(); + return *pudp; +} + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -1257,6 +1387,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full); +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pud_t *pudp, int full); + #define __HAVE_ARCH_PGTABLE_DEPOSIT static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1305,6 +1440,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return hash__pmd_mkdevmap(pmd); } +static inline pud_t pud_mkdevmap(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkdevmap(pud); + BUG(); + return pud; +} + static inline int pmd_devmap(pmd_t pmd) { return pte_devmap(pmd_pte(pmd)); @@ -1312,7 +1455,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) @@ -1321,16 +1464,6 @@ static inline int pgd_devmap(pgd_t pgd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int pud_pfn(pud_t pud) -{ - /* - * Currently all calls to pud_pfn() are gated around a pud_devmap() - * check so this should never be used. If it grows another user we - * want to know about it. - */ - BUILD_BUG(); - return 0; -} #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 686001eda936..2ef92f36340f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud) return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); } +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b) +{ + return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0); +} static inline int radix__p4d_bad(p4d_t p4d) { @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE); } +static inline int radix__pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; +} + +static inline pud_t radix__pud_mkhuge(pud_t pud) +{ + return __pud(pud_val(pud) | _PAGE_PTE); +} + extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set); +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set); extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp); + static inline int radix__has_transparent_hugepage(void) { /* For radix 2M at PMD level means thp */ @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void) return 1; return 0; } + +static inline int radix__has_transparent_pud_hugepage(void) +{ + /* For radix 1G at PUD level means pud hugepage support */ + if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT) + return 1; + return 0; +} #endif static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) @@ -292,9 +320,17 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); } +static inline pud_t radix__pud_mkdevmap(pud_t pud) +{ + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +struct vmem_altmap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 77797a2a82eb..a38542259fab 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dca0477b0709..1950c1b825b4 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -51,6 +51,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, radix__flush_pmd_tlb_range(vma, start, end); } +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + radix__flush_pud_tlb_range(vma, start, end); +} + #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 85c84e89e3ea..75b938268b04 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_large(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) { return pfn_pmd(page_to_pfn(page), pgprot); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 02e185d2e4d6..227fea53c217 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -967,6 +967,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add return old; } +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); + + return old; +} + pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -1043,6 +1060,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 06e647ef19d1..3020a8b38572 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1465,6 +1465,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 45fd975ef521..340b86ef7284 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 368a0590d954a659b16ab945328ada0cc10f93a0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:56 +0530 Subject: powerpc/book3s64/vmemmap: switch radix to use a different vmemmap handling function This is in preparation to update radix to implement vmemmap optimization for devdax. Below are the rules w.r.t radix vmemmap mapping 1. First try to map things using PMD (2M) 2. With altmap if altmap cross-boundary check returns true, fall back to PAGE_SIZE 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to PAGE_SIZE On removing vmemmap mapping, check if every subsection that is using the vmemmap area is invalid. If found to be invalid, that implies we can safely free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K page size, we need to do the above check even at the PAGE_SIZE granularity. [aneesh.kumar@linux.ibm.com: fix section mismatch warning] Link: https://lkml.kernel.org/r/87h6pqvu5g.fsf@linux.ibm.com [aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/877cqkwd20.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-11-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/radix.h | 2 + arch/powerpc/include/asm/pgtable.h | 6 + arch/powerpc/mm/book3s64/radix_pgtable.c | 327 ++++++++++++++++++++++++++--- arch/powerpc/mm/init_64.c | 26 ++- 4 files changed, 329 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 2ef92f36340f..f1461289643a 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys); int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 445a22987aa3..a4893b17705a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,6 +157,12 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } +#ifdef CONFIG_PPC64 +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size); +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 227fea53c217..6d04dd579d03 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -744,8 +744,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end, bool direct) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. + */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte; @@ -759,24 +809,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; } - - pte_clear(&init_mm, addr, pte); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif } if (direct) update_page_count(mmu_virtual_psize, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte_base; @@ -790,18 +842,24 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, continue; if (pmd_is_leaf(*pmd)) { - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) { - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; } - pte_clear(&init_mm, addr, (pte_t *)pmd); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next, direct); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } if (direct) @@ -809,7 +867,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pmd_t *pmd_base; @@ -834,15 +893,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, } pmd_base = pud_pgtable(*pud); - remove_pmd_table(pmd_base, addr, next, direct); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } if (direct) update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end, - bool direct) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; @@ -871,7 +931,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, } pud_base = p4d_pgtable(*p4d); - remove_pud_table(pud_base, addr, next, direct); + remove_pud_table(pud_base, addr, next, direct, altmap); free_pud_table(pud_base, p4d); } @@ -894,7 +954,7 @@ int __meminit radix__create_section_mapping(unsigned long start, int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end, true); + remove_pagetable(start, end, true, NULL); return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -926,10 +986,223 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size, false); + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); } #endif #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fe1b83020e0d..af0c9891c38f 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -92,7 +92,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad * a page table lookup here because with the hash translation we don't keep * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { struct page *start; unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; @@ -183,8 +183,8 @@ static __meminit int vmemmap_list_populate(unsigned long phys, return 0; } -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, - unsigned long page_size) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); @@ -204,6 +204,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + /* Align to the page size of the linear mapping. */ start = ALIGN_DOWN(start, page_size); @@ -303,8 +308,8 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -void __ref vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; unsigned long page_order = get_order(page_size); @@ -362,6 +367,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, vmemmap_remove_mapping(start, page_size); } } + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); +} + #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) -- cgit v1.2.3 From f2b79c0d79683d552369bb9a7282c1f0226fc566 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:57 +0530 Subject: powerpc/book3s64/radix: add support for vmemmap optimization for radix With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence with 64K page size, we don't use vmemmap deduplication for PMD-level mapping. [aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n] Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/radix.h | 11 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 203 +++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9222c138c457..d0497d13f5b4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -174,6 +174,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index f1461289643a..357e23a403d3 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(pud_t pud) } struct vmem_altmap; +struct dev_pagemap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); @@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end); void radix__kernel_map_pages(struct page *page, int numpages, int enable); +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +#define vmemmap_can_optimize vmemmap_can_optimize +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +#endif + +#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap); #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 6d04dd579d03..c264e6a3620e 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -986,6 +986,15 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } + +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} + int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) { @@ -1193,6 +1202,200 @@ base_mapping: return 0; } +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { -- cgit v1.2.3 From 601f006fddc66e369fdac7c572f981eafd159dac Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:58 +0530 Subject: powerpc/book3s64/radix: remove mmu_vmemmap_psize This is not used by radix anymore. [aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/874jlowd0c.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-13-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ----------- arch/powerpc/mm/init_64.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index c264e6a3620e..955cf6b68a3a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void) #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; -#endif #endif /* * initialize page table size diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index af0c9891c38f..f7930360406e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; -#ifdef CONFIG_PPC_BOOK3S_64 - if (radix_enabled()) - return radix__vmemmap_populate(start, end, node, altmap); -#endif - /* Align to the page size of the linear mapping. */ start = ALIGN_DOWN(start, page_size); @@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { -- cgit v1.2.3 From 6be3601517d90b728095d70c14f3a04b9adcb166 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:59 +0530 Subject: powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation Add some extra vmemmap pr_debug message that will indicate the type of vmemmap allocations. For ex: with DAX vmemmap optimization we can find the below details: [ 187.166580] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166587] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166591] radix-mmu: Tail page reuse vmemmap mapping [ 187.166594] radix-mmu: Tail page reuse vmemmap mapping [ 187.166598] radix-mmu: Tail page reuse vmemmap mapping [ 187.166601] radix-mmu: Tail page reuse vmemmap mapping [ 187.166604] radix-mmu: Tail page reuse vmemmap mapping [ 187.166608] radix-mmu: Tail page reuse vmemmap mapping [ 187.166611] radix-mmu: Tail page reuse vmemmap mapping [ 187.166614] radix-mmu: Tail page reuse vmemmap mapping [ 187.166617] radix-mmu: Tail page reuse vmemmap mapping [ 187.166620] radix-mmu: Tail page reuse vmemmap mapping [ 187.166623] radix-mmu: Tail page reuse vmemmap mapping [ 187.166626] radix-mmu: Tail page reuse vmemmap mapping [ 187.166629] radix-mmu: Tail page reuse vmemmap mapping [ 187.166632] radix-mmu: Tail page reuse vmemmap mapping And without vmemmap optimization [ 293.549931] radix-mmu: PMD_SIZE vmemmap mapping [ 293.549984] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550032] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550076] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550117] radix-mmu: PMD_SIZE vmemmap mapping Link: https://lkml.kernel.org/r/20230724190759.483013-14-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 955cf6b68a3a..96679018e7fb 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1033,6 +1033,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); if (!p) return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); } else { /* * When a PTE/PMD entry is freed from the init_mm @@ -1045,6 +1046,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long */ get_page(reuse); p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); } VM_BUG_ON(!PAGE_ALIGNED(addr)); @@ -1154,6 +1156,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); continue; } else if (altmap) { /* -- cgit v1.2.3 From 6a718bd2ed4a589e7e50e7d028d24e087139dd03 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 1 Aug 2023 20:42:03 +0800 Subject: arm64: tlbflush: add some comments for TLB batched flushing Add comments for arch_flush_tlb_batched_pending() and arch_tlbbatch_flush() to illustrate why only a DSB is needed. Link: https://lkml.kernel.org/r/20230801124203.62164-1-yangyicong@huawei.com Cc: Catalin Marinas Signed-off-by: Yicong Yang Reviewed-by: Alistair Popple Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 84a05a0bd2b6..55b50e1d4a84 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -304,11 +304,26 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b __flush_tlb_page_nosync(mm, uaddr); } +/* + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to + * synchronise all the TLBI issued with a DSB to avoid the race mentioned in + * flush_tlb_batched_pending(). + */ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { dsb(ish); } +/* + * To support TLB batched flush for multiple pages unmapping, we only send + * the TLBI for each page in arch_tlbbatch_add_pending() and wait for the + * completion at the end in arch_tlbbatch_flush(). Since we've already issued + * TLBI for each page so only a DSB is needed to synchronise its effect on the + * other CPUs. + * + * This will save the time waiting on DSB comparing issuing a TLBI;DSB sequence + * for each page. + */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { dsb(ish); -- cgit v1.2.3 From 9cf6a060f95578c8147bdacdf55a1eaaa182ce49 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2023 09:27:31 +0800 Subject: arm64: hugetlb: enable __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE It is better to use huge page size instead of PAGE_SIZE for stride when flush hugepage, which reduces the loop in __flush_tlb_range(). Let's support arch's flush_hugetlb_tlb_range(), which is used in hugetlb_unshare_all_pmds(), move_hugetlb_page_tables() and hugetlb_change_protection() for now. Note,: for hugepages based on contiguous bit, it has to be invalidated individually since the contiguous PTE bit is just a hint, the hardware may or may not take it into account. Link: https://lkml.kernel.org/r/20230802012731.62512-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Muchun Song Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Cc: Joel Fernandes (Google) Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Mina Almasry Cc: Will Deacon Cc: William Kucharski Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 6a4a1ab8eb23..a91d6219aa78 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -60,4 +60,19 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #include +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + unsigned long stride = huge_page_size(hstate_vma(vma)); + + if (stride == PMD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 2); + else if (stride == PUD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 1); + else + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); +} + #endif /* __ASM_HUGETLB_H */ -- cgit v1.2.3 From 60081bf19b0ec8fa40c589bd361fa2bc763f1050 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:22 -0700 Subject: mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is not obvious and makes it hard to understand where vma locking is happening. Also in some cases (like in dup_userfaultfd()) vma should be locked earlier than vma_flags modification. To make locking more visible, change these functions to assert that the vma write lock is taken and explicitly lock the vma beforehand. Fix userfaultfd functions which should lock the vma earlier. Link: https://lkml.kernel.org/r/20230804152724.3090321-5-surenb@google.com Suggested-by: Linus Torvalds Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 709ebd578394..e2d6f9327f77 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + vma_start_write(vma); /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, -- cgit v1.2.3 From 04d5ea46a15149a12f79c686b6a1ffc9c3233272 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:56 +0530 Subject: mm/memory_hotplug: simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig Patch series "Add support for memmap on memory feature on ppc64", v8. This patch series update memmap on memory feature to fall back to memmap allocation outside the memory block if the alignment rules are not met. This makes the feature more useful on architectures like ppc64 where alignment rules are different with 64K page size. This patch (of 6): Instead of adding menu entry with all supported architectures, add mm/Kconfig variable and select the same from supported architectures. No functional change in this patch. Link: https://lkml.kernel.org/r/20230808091501.287660-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230808091501.287660-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 751d8c8821db..a7a88322bf46 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -78,6 +78,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY select ARCH_USE_MEMTEST @@ -349,9 +350,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - config SMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78224aa76409..d0258e92a8af 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER -- cgit v1.2.3 From 603fd64dfa45d1e9df996911e4010f2b00731387 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:00 +0530 Subject: powerpc/book3s64/memhotplug: enable memmap on memory for radix Radix vmemmap mapping can map things correctly at the PMD level or PTE level based on different device boundary checks. Hence we skip the restrictions w.r.t vmemmap size to be multiple of PMD_SIZE. This also makes the feature widely useful because to use PMD_SIZE vmemmap area we require a memory block size of 2GiB We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY to that the feature can work with a memory block size of 256MB. Using altmap.reserve feature to align things correctly at pageblock granularity. We can end up losing some pages in memory with this. For ex: with a 256MiB memory block size, we require 4 pages to map vmemmap pages, In order to align things correctly we end up adding a reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved. Link: https://lkml.kernel.org/r/20230808091501.287660-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/pgtable.h | 21 +++++++++++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d0497d13f5b4..938294c996dc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -157,6 +157,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index a4893b17705a..33464e6d6431 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, unsigned long page_size); +/* + * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details + * some of the restrictions. We don't check for PMD_SIZE because our + * vmemmap allocation code can fallback correctly. The pageblock + * alignment requirement is met using altmap->reserve blocks. + */ +#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) +{ + if (!radix_enabled()) + return false; + /* + * With 4K page size and 2M PMD_SIZE, we can align + * things better with memory block size value + * starting from 128MB. Hence align things with PMD_SIZE. + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) + return IS_ALIGNED(vmemmap_size, PMD_SIZE); + return true; +} + #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9c62c2c3b3d0..4f3d6a2f9065 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) nid = first_online_node; /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE); + rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; -- cgit v1.2.3 From 1865484af6b2ced2f367c1042b5f3816c11db148 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:48 -0700 Subject: mm: convert ptlock_ptr() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/xen/mmu_pv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e0a975165de7..8796ec310483 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) spinlock_t *ptl = NULL; #if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); + ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif -- cgit v1.2.3 From 4eaca96140b33e2d1fad761d1468c8293859da11 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:54 -0700 Subject: powerpc: convert various functions to use ptdescs In order to split struct ptdesc from struct page, convert various functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-13-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/mmu_context.c | 10 +++--- arch/powerpc/mm/book3s64/pgtable.c | 32 +++++++++---------- arch/powerpc/mm/pgtable-frag.c | 58 +++++++++++++++++----------------- 3 files changed, 50 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c766e4c26e42..1715b07c630c 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx) static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 75b938268b04..1498ccd08367 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -384,22 +384,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -409,12 +409,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -435,15 +435,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 0c6b68130025..8c31802f97e8 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -18,15 +18,15 @@ void pte_frag_destroy(void *pte_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pte_frag); + ptdesc = virt_to_ptdesc(pte_frag); /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pte_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } } else { - page = alloc_page(PGALLOC_GFP); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP, 0); + if (!ptdesc) return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return ret; spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!pte_frag_get(&mm->context))) { - atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); } spin_unlock(&mm->page_table_lock); @@ -108,28 +108,28 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_fragment_free(unsigned long *table, int kernel) { - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { if (kernel) - __free_page(page); - else if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + pagetable_free(ptdesc); + else if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } } -- cgit v1.2.3 From f92c494f420a9fdb253861089f615e6e977aa62d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:55 -0700 Subject: x86: convert various functions to use ptdescs In order to split struct ptdesc from struct page, convert various functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-14-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 15a8009a4480..d3a93e8766ee 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -52,7 +52,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_pte_page_dtor(pte); + pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table @@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pgtable_pmd_page_dtor(page); - paravirt_tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 @@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) static inline void pgd_list_add(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_add(&page->lru, &pgd_list); + list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_del(&page->lru); + list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ @@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - virt_to_page(pgd)->pt_mm = mm; + virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return page->pt_mm; + return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) @@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; + struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { - pgtable_pmd_page_dtor(virt_to_page(pmds[i])); - free_page((unsigned long)pmds[i]); + ptdesc = virt_to_ptdesc(pmds[i]); + + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } @@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; + gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(gfp); - if (!pmd) + pmd_t *pmd = NULL; + struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); + + if (!ptdesc) failed = true; - if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { - free_page((unsigned long)pmd); - pmd = NULL; + if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + ptdesc = NULL; failed = true; } - if (pmd) + if (ptdesc) { mm_inc_nr_pmds(mm); + pmd = ptdesc_address(ptdesc); + } + pmds[i] = pmd; } @@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) free_page((unsigned long)pmd_sv); - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; -- cgit v1.2.3 From 6326c26c1514757242829b292b26eac589013200 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:56 -0700 Subject: s390: convert various pgalloc functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +- arch/s390/include/asm/tlb.h | 4 +- arch/s390/mm/pgalloc.c | 128 ++++++++++++++++++++-------------------- 3 files changed, 69 insertions(+), 67 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 89a9d5ef94f8..376b4b23bdaa 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b91f4a9b044c..383b1f91442c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d7374add7820..07fc660a24aa 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ @@ -242,7 +242,7 @@ void page_table_free_pgste(struct page *page) unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long *table; - struct page *page; + struct ptdesc *ptdesc; unsigned int mask, bit; /* Try to get a fragment of a 4K page as a 2K page table */ @@ -250,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = NULL; spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; + ptdesc = list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; /* * The pending removal bits must also be checked. * Failure to do so might lead to an impossible @@ -264,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm) */ mask = (mask | (mask >> 4)) & 0x03U; if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, + atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)); - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } } spin_unlock_bh(&mm->context.lock); @@ -278,28 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); + arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - INIT_LIST_HEAD(&page->lru); - atomic_xor_bits(&page->_refcount, 0x03U << 24); + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); spin_unlock_bh(&mm->context.lock); } return table; @@ -322,19 +322,18 @@ static void page_table_release_check(struct page *page, void *table, static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); @@ -344,51 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) * will happen outside of the critical section from this * function or from __tlb_remove_table() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to head of list, to make * this freed half available for immediate reuse. */ - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); mask >>= 24; if (mask != 0x00U) return; half = 0x01U << bit; } else { half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - struct page *page; unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); @@ -398,19 +396,19 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, * outside of the critical section from __tlb_remove_table() or from * page_table_free() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to end of list, to make * this freed half available for reuse once its pending * bit has been cleared by __tlb_remove_table(). */ - list_add_tail(&page->lru, &mm->context.pgtable_list); + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); @@ -421,30 +419,30 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); switch (half) { case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(ptdesc); return; case 0x01U: /* lower 2K of a 4K page table */ case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); mask >>= 24; if (mask != 0x00U) return; break; case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; break; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -488,16 +486,20 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ -- cgit v1.2.3 From 358d1c39c82afaed58778633f6ed76c8fe9dbf9c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:59 -0700 Subject: arm: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. late_alloc() also uses the __get_free_pages() helper function. Convert this to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-18-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/include/asm/tlb.h | 12 +++++++----- arch/arm/mm/mmu.c | 7 ++++--- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b8cbe03ad260..f40d06ad5d2a 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_pte_page_dtor(pte); + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); #ifndef CONFIG_ARM_LPAE /* @@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); #endif - tlb_remove_table(tlb, pte); + tlb_remove_ptdesc(tlb, ptdesc); } static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { #ifdef CONFIG_ARM_LPAE - struct page *page = virt_to_page(pmdp); + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); #endif } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 13fc4bb5f792..fdeaee30d167 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -737,11 +737,12 @@ static void __init *early_alloc(unsigned long sz) static void *__init late_alloc(unsigned long sz) { - void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); + void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, + get_order(sz)); - if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) + if (!ptdesc || !pagetable_pte_ctor(ptdesc)) BUG(); - return ptr; + return ptdesc_to_virt(ptdesc); } static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, -- cgit v1.2.3 From 11b4fa8b2a56c0e7b9db4fe21c134a4cba414657 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:00 -0700 Subject: arm64: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-19-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlb.h | 14 ++++++++------ arch/arm64/mm/mmu.c | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index c995d1f4594f..2c29239d05c3 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_pte_page_dtor(pte); - tlb_remove_table(tlb, pte); + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { - struct page *page = virt_to_page(pmdp); + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #endif @@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, unsigned long addr) { - tlb_remove_table(tlb, virt_to_page(pudp)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp)); } #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 95d360805f8a..47781bec6171 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift) static phys_addr_t pgd_pgtable_alloc(int shift) { phys_addr_t pa = __pgd_pgtable_alloc(shift); + struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); /* * Call proper page table ctor in case later we need to @@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * this pre-allocated page table. * * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pgtable_pmd_page_ctor() becomes nop. + * folded, and if so pagetable_pte_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pte_ctor(ptdesc)); else if (shift == PMD_SHIFT) - BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pmd_ctor(ptdesc)); return pa; } -- cgit v1.2.3 From e647333995dde9c0b89369a4c23b9e410a080825 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:01 -0700 Subject: csky: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-20-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Guo Ren Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 7d57e5da0914..9c84c9012e53 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page(tlb, pte); \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ } while (0) extern void pagetable_init(void); -- cgit v1.2.3 From b45a12c0070a1d3e7666921f321e390df064f372 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:02 -0700 Subject: hexagon: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-21-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index f0c47e6a7427..55988625e6fb 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, max_kernel_seg = pmdindex; } -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pgtable_pte_page_dtor((pte)); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor((page_ptdesc(pte))); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif -- cgit v1.2.3 From 382739797f79ec2a0ca28d3f00b2559b4905f94e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:03 -0700 Subject: loongarch: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-22-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgalloc.h | 27 +++++++++++++++------------ arch/loongarch/mm/pgtable.c | 7 ++++--- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index af1d1e4a6965..23f5b1107246 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -45,9 +45,9 @@ extern void pagetable_init(void); extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) #ifndef __PAGETABLE_PMD_FOLDED @@ -55,18 +55,18 @@ do { \ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; - struct page *pg; + struct ptdesc *ptdesc; - pg = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pg) + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(pg)) { - __free_page(pg); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pmd = (pmd_t *)page_address(pg); + pmd = ptdesc_address(ptdesc); pmd_init(pmd); return pmd; } @@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - pud = (pud_t *) __get_free_page(GFP_KERNEL); - if (pud) - pud_init(pud); + if (!ptdesc) + return NULL; + pud = ptdesc_address(ptdesc); + + pud_init(pud); return pud; } diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 36a6dc0148ae..5bd102b51f7c 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -11,10 +11,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret, *init; + pgd_t *init, *ret = NULL; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - ret = (pgd_t *) __get_free_page(GFP_KERNEL); - if (ret) { + if (ptdesc) { + ret = (pgd_t *)ptdesc_address(ptdesc); init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, -- cgit v1.2.3 From bff28e6bd08e65ac5540de571e9dfd33f7481148 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:04 -0700 Subject: m68k: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-23-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgalloc.h | 47 ++++++++++++++++++------------------ arch/m68k/include/asm/sun3_pgalloc.h | 8 +++--- arch/m68k/mm/motorola.c | 4 +-- 3 files changed, 30 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 5c2c0a864524..302c5bf67179 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -5,22 +5,22 @@ #include #include -extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long) pte); + pagetable_free(virt_to_ptdesc(pte)); } extern const char bad_pmd_string[]; -extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - unsigned long page = __get_free_page(GFP_DMA); + struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_ZERO) & + ~__GFP_HIGHMEM, 0); - if (!page) + if (!ptdesc) return NULL; - memset((void *)page, 0, PAGE_SIZE); - return (pte_t *) (page); + return ptdesc_address(ptdesc); } extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) @@ -35,36 +35,34 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, unsigned long address) { - struct page *page = virt_to_page(pgtable); + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_DMA, 0); + struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0); pte_t *pte; - if (!page) + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pte = page_address(page); - clear_page(pte); - + pte = ptdesc_address(ptdesc); return pte; } static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) { - struct page *page = virt_to_page(pgtable); + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } /* @@ -75,16 +73,19 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_page((unsigned long) pgd); + pagetable_free(virt_to_ptdesc(pgd)); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd; + struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_NOWARN) & + ~__GFP_HIGHMEM, 0); - new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN); - if (!new_pgd) + if (!ptdesc) return NULL; + new_pgd = ptdesc_address(ptdesc); + memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t)); memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT); return new_pgd; diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 198036aff519..ff48573db2c0 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -17,10 +17,10 @@ extern const char bad_pmd_string[]; -#define __pte_free_tlb(tlb,pte,addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index c75984e2d86b..594575a0780c 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -161,7 +161,7 @@ void *get_pointer_table(int type) * m68k doesn't have SPLIT_PTE_PTLOCKS for not having * SMP. */ - pgtable_pte_page_ctor(virt_to_page(page)); + pagetable_pte_ctor(virt_to_ptdesc(page)); } mmu_page_ctor(page); @@ -201,7 +201,7 @@ int free_pointer_table(void *table, int type) list_del(dp); mmu_page_dtor((void *)page); if (type == TABLE_PTE) - pgtable_pte_page_dtor(virt_to_page((void *)page)); + pagetable_pte_dtor(virt_to_ptdesc((void *)page)); free_page (page); return 1; } else if (ptable_list[type].next != dp) { -- cgit v1.2.3 From 3e14fb19ad7cef7a6e998caabed3de4232a3f257 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:05 -0700 Subject: mips: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-24-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 32 ++++++++++++++++++-------------- arch/mips/mm/pgtable.c | 8 +++++--- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index f72e737dda21..40e40a7eb94a 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_TABLE_ORDER); + pagetable_free(virt_to_ptdesc(pgd)); } -#define __pte_free_tlb(tlb,pte,address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, address) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) #ifndef __PAGETABLE_PMD_FOLDED @@ -65,18 +65,18 @@ do { \ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; - struct page *pg; + struct ptdesc *ptdesc; - pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); - if (!pg) + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(pg)) { - __free_pages(pg, PMD_TABLE_ORDER); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pmd = (pmd_t *)page_address(pg); + pmd = ptdesc_address(ptdesc); pmd_init(pmd); return pmd; } @@ -90,10 +90,14 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, + PUD_TABLE_ORDER); - pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER); - if (pud) - pud_init(pud); + if (!ptdesc) + return NULL; + pud = ptdesc_address(ptdesc); + + pud_init(pud); return pud; } diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index b13314be5d0e..1506e458040d 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -10,10 +10,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret, *init; + pgd_t *init, *ret = NULL; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, + PGD_TABLE_ORDER); - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); - if (ret) { + if (ptdesc) { + ret = ptdesc_address(ptdesc); init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, -- cgit v1.2.3 From 61139e9a7592edc431e7586aab475a77c957e65c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:06 -0700 Subject: nios2: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-25-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Dinh Nguyen Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index ecd1657bb2ce..ce6bb8e74271 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, extern pgd_t *pgd_alloc(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ + do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif /* _ASM_NIOS2_PGALLOC_H */ -- cgit v1.2.3 From 5823b9fe0451869b4c67a920533d47c5bf1e7628 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:07 -0700 Subject: openrisc: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-26-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index b7b2b8d16fad..c6a73772a546 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif -- cgit v1.2.3 From 380f2c1ae9d45a1aa19a2b05dbe57371feebb394 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:08 -0700 Subject: riscv: convert alloc_{pmd, pte}_late() to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-27-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Palmer Dabbelt Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgalloc.h | 8 ++++---- arch/riscv/mm/init.c | 16 ++++++---------- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 59dc12b5b7e8..d169a4f41a2e 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #endif /* __PAGETABLE_PMD_FOLDED */ -#define __pte_free_tlb(tlb, pte, buf) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, buf) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9ce504737d18..430a3d05a841 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -353,12 +353,10 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) static phys_addr_t __init alloc_pte_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - return __pa(vaddr); + BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc)); + return __pa((pte_t *)ptdesc_address(ptdesc)); } static void __init create_pte_mapping(pte_t *ptep, @@ -436,12 +434,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) static phys_addr_t __init alloc_pmd_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - return __pa(vaddr); + BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc)); + return __pa((pmd_t *)ptdesc_address(ptdesc)); } static void __init create_pmd_mapping(pmd_t *pmdp, -- cgit v1.2.3 From bb3be388537b85b567dd614b492d66f383bc8273 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:09 -0700 Subject: sh: convert pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Also cleans up some spacing issues. Link: https://lkml.kernel.org/r/20230807230513.102486-28-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Geert Uytterhoeven Acked-by: John Paul Adrian Glaubitz Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/include/asm/pgalloc.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index a9e98233c4d4..5d8577ab1591 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -2,6 +2,7 @@ #ifndef __ASM_SH_PGALLOC_H #define __ASM_SH_PGALLOC_H +#include #include #define __HAVE_ARCH_PMD_ALLOC_ONE @@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define __pte_free_tlb(tlb,pte,addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif /* __ASM_SH_PGALLOC_H */ -- cgit v1.2.3 From b3311d707c8f1c1d3b0e45f5c4785e9ca3d723a1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:10 -0700 Subject: sparc64: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-29-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 0d7fd793924c..9a63a3e08e40 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2893,14 +2893,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm) pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); + + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - return (pte_t *) page_address(page); + return ptdesc_address(ptdesc); } void pte_free_kernel(struct mm_struct *mm, pte_t *pte) @@ -2910,10 +2911,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static void __pte_free(pgtable_t pte) { - struct page *page = virt_to_page(pte); + struct ptdesc *ptdesc = virt_to_ptdesc(pte); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_free(struct mm_struct *mm, pgtable_t pte) -- cgit v1.2.3 From 222107e1601f6de9c662784929c0f819cc01fa21 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:11 -0700 Subject: sparc: convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents Part of the conversions to replace pgtable pte constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-30-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sparc/mm/srmmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 13f027afc875..8393faa3e596 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return NULL; page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); - if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) { + if (page_ref_inc_return(page) == 2 && + !pagetable_pte_ctor(page_ptdesc(page))) { page_ref_dec(page); ptep = NULL; } @@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); if (page_ref_dec_return(page) == 1) - pgtable_pte_page_dtor(page); + pagetable_pte_dtor(page_ptdesc(page)); spin_unlock(&mm->page_table_lock); srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); -- cgit v1.2.3 From da9aefca789d753071e3f36fa940da329c11f7f8 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:12 -0700 Subject: um: convert {pmd, pte}_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Also cleans up some spacing issues. Link: https://lkml.kernel.org/r/20230807230513.102486-31-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/um/include/asm/pgalloc.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 8ec7cd46dd96..de5e31c64793 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,19 +25,19 @@ */ extern pgd_t *pgd_alloc(struct mm_struct *); -#define __pte_free_tlb(tlb,pte, address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb),(pte)); \ +#define __pte_free_tlb(tlb, pte, address) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #ifdef CONFIG_3_LEVEL_PGTABLES -#define __pmd_free_tlb(tlb, pmd, address) \ -do { \ - pgtable_pmd_page_dtor(virt_to_page(pmd)); \ - tlb_remove_page((tlb),virt_to_page(pmd)); \ -} while (0) \ +#define __pmd_free_tlb(tlb, pmd, address) \ +do { \ + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ +} while (0) #endif -- cgit v1.2.3 From 889690bcbccb457d85cfd16ce0d8259ecdb10ce4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:52:38 -0700 Subject: arm: include asm/cacheflush.h in asm/hugetlb.h Patch series "arch: include asm/cacheflush.h in asm/hugetlb.h". Three architectures are using PG_dcache_clean in their asm/hugetlb.h, but relying on accident to include the asm/cacheflush.h which defines it. This patch (of 3): PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). Link: https://lkml.kernel.org/r/6d2acfa4-7f44-d3b4-b0a8-5495c5985e4c@google.com Link: https://lkml.kernel.org/r/4b055d0-7b2e-72bf-9b9d-8f3f1cd312d0@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index d02d6ca88e92..a3a82b7158d4 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -10,6 +10,7 @@ #ifndef _ASM_ARM_HUGETLB_H #define _ASM_ARM_HUGETLB_H +#include #include #include #include -- cgit v1.2.3 From 1de8c835a936859f01a91174474a8b96d61b0400 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:53:17 -0700 Subject: arm64: include asm/cacheflush.h in asm/hugetlb.h PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). Link: https://lkml.kernel.org/r/bd77cc1b-e83b-f276-9e27-c19e7c9119aa@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index a91d6219aa78..f43a38ac1779 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -10,6 +10,7 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H +#include #include #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION -- cgit v1.2.3 From 33a9fb09836ace6cb7bd48cf30ab22d56285560c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:53:57 -0700 Subject: riscv: include asm/cacheflush.h in asm/hugetlb.h PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). Link: https://lkml.kernel.org/r/84bd3b96-8dbe-51b1-d7d1-6e4f9d8937d8@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index ce1ebda1a49a..34e24f078cc1 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _ASM_RISCV_HUGETLB_H #define _ASM_RISCV_HUGETLB_H +#include #include static inline void arch_clear_hugepage_flags(struct page *page) -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc6552..2e5d1e238af9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0..b1723094d464 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48c..6115d7514972 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3..2e861b9360c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); -- cgit v1.2.3 From f9bff0e31881d03badf191d3b0005839391f5f2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:29 +0100 Subject: minmax: add in_range() macro Patch series "New page table range API", v6. This patchset changes the API used by the MM to set up page table entries. The four APIs are: set_ptes(mm, addr, ptep, pte, nr) update_mmu_cache_range(vma, addr, ptep, nr) flush_dcache_folio(folio) flush_icache_pages(vma, page, nr) flush_dcache_folio() isn't technically new, but no architecture implemented it, so I've done that for them. The old APIs remain around but are mostly implemented by calling the new interfaces. The new APIs are based around setting up N page table entries at once. The N entries belong to the same PMD, the same folio and the same VMA, so ptep++ is a legitimate operation, and locking is taken care of for you. Some architectures can do a better job of it than just a loop, but I have hesitated to make too deep a change to architectures I don't understand well. One thing I have changed in every architecture is that PG_arch_1 is now a per-folio bit instead of a per-page bit when used for dcache clean/dirty tracking. This was something that would have to happen eventually, and it makes sense to do it now rather than iterate over every page involved in a cache flush and figure out if it needs to happen. The point of all this is better performance, and Fengwei Yin has measured improvement on x86. I suspect you'll see improvement on your architecture too. Try the new will-it-scale test mentioned here: https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/ You'll need to run it on an XFS filesystem and have CONFIG_TRANSPARENT_HUGEPAGE set. This patchset is the basis for much of the anonymous large folio work being done by Ryan, so it's received quite a lot of testing over the last few months. This patch (of 38): Determine if a value lies within a range more efficiently (subtraction + comparison vs two comparisons and an AND). It also has useful (under some circumstances) behaviour if the range exceeds the maximum value of the type. Convert all the conflicting definitions of in_range() within the kernel; some can use the generic definition while others need their own definition. Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/pageattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index c3c34fe714b0..064ad508c149 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -25,7 +25,7 @@ static int change_page_range(pte_t *ptep, unsigned long addr, void *data) return 0; } -static bool in_range(unsigned long start, unsigned long size, +static bool range_in_range(unsigned long start, unsigned long size, unsigned long range_start, unsigned long range_end) { return start >= range_start && start < range_end && @@ -63,8 +63,8 @@ static int change_memory_common(unsigned long addr, int numpages, if (!size) return 0; - if (!in_range(start, size, MODULES_VADDR, MODULES_END) && - !in_range(start, size, VMALLOC_START, VMALLOC_END)) + if (!range_in_range(start, size, MODULES_VADDR, MODULES_END) && + !range_in_range(start, size, VMALLOC_START, VMALLOC_END)) return -EINVAL; return __change_memory_common(start, size, set_mask, clear_mask); -- cgit v1.2.3 From a379322022c0961fe0b638cdd842d3c38eeff92c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:30 +0100 Subject: mm: convert page_table_check_pte_set() to page_table_check_ptes_set() Tell the page table check how many PTEs & PFNs we want it to check. Link: https://lkml.kernel.org/r/20230802151406.3735276-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fe4b913589ee..445b18d7a47c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 44377f0d7c35..01e4aabc8898 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, 1); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada1bbf12961..cd0b6337d03c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); set_pte(ptep, pte); } -- cgit v1.2.3 From 63497b716be30fb268b2358836efb4bb9e615f15 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:35 +0100 Subject: alpha: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_icache_pages(). Link: https://lkml.kernel.org/r/20230802151406.3735276-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton --- arch/alpha/include/asm/cacheflush.h | 10 ++++++++++ arch/alpha/include/asm/pgtable.h | 10 ++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 9945ff483eaf..3956460e69e2 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -57,6 +57,16 @@ extern void flush_icache_user_page(struct vm_area_struct *vma, #define flush_icache_page(vma, page) \ flush_icache_user_page((vma), (page), 0, 0) +/* + * Both implementations of flush_icache_user_page flush the entire + * address space, so one call, no matter how many pages. + */ +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) +{ + flush_icache_user_page(vma, page, 0, 0); +} + #include #endif /* _ALPHA_CACHEFLUSH_H */ diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index ba43cb841d19..747b5f706c47 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -26,7 +26,6 @@ struct vm_area_struct; * hook is made available. */ #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) /* PMD_SHIFT determines the size of the area a second-level page table can map */ #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) @@ -189,7 +188,8 @@ extern unsigned long __zero_page(void); * and a page entry and page directory to the page they refer to. */ #define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) -#define pte_pfn(pte) (pte_val(pte) >> 32) +#define PFN_PTE_SHIFT 32 +#define pte_pfn(pte) (pte_val(pte) >> PFN_PTE_SHIFT) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define mk_pte(page, pgprot) \ @@ -303,6 +303,12 @@ extern inline void update_mmu_cache(struct vm_area_struct * vma, { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) +{ +} + /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that * are !pte_none() && !pte_present(). -- cgit v1.2.3 From ac4cfaccedac891d29560ddfb64cb5c1e710e1e1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:36 +0100 Subject: arc: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_dc_clean flag from being per-page to per-folio (which means it cannot always be set as we don't know that all pages in this folio were cleaned). Enhance the internal flush routines to take the number of pages to flush. Link: https://lkml.kernel.org/r/20230802151406.3735276-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/include/asm/cacheflush.h | 7 +++- arch/arc/include/asm/pgtable-bits-arcv2.h | 12 +++--- arch/arc/include/asm/pgtable-levels.h | 1 + arch/arc/mm/cache.c | 61 +++++++++++++++++++------------ arch/arc/mm/tlb.c | 18 +++++---- 5 files changed, 59 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index e201b4b1655a..04f65f588510 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -25,17 +25,20 @@ * in update_mmu_cache() */ #define flush_icache_page(vma, page) +#define flush_icache_pages(vma, page, nr) void flush_cache_all(void); void flush_icache_range(unsigned long kstart, unsigned long kend); void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len); -void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr); -void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr); +void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr); +void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio void dma_cache_wback_inv(phys_addr_t start, unsigned long sz); void dma_cache_inv(phys_addr_t start, unsigned long sz); diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 6e9f8ca6d6a1..ee78ab30958d 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -100,14 +100,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - set_pte(ptep, pteval); -} +struct vm_fault; +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h index ef68758b69f7..fc417c75c24d 100644 --- a/arch/arc/include/asm/pgtable-levels.h +++ b/arch/arc/include/asm/pgtable-levels.h @@ -169,6 +169,7 @@ #define pte_ERROR(e) \ pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_none(x) (!pte_val(x)) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_clear(mm,addr,ptep) set_pte_at(mm, addr, ptep, __pte(0)) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 55c6de138eae..3c16ee942a5c 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -752,17 +752,17 @@ static inline void arc_slc_enable(void) * There's a corollary case, where kernel READs from a userspace mapped page. * If the U-mapping is not congruent to K-mapping, former needs flushing. */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; if (!cache_is_vipt_aliasing()) { - clear_bit(PG_dc_clean, &page->flags); + clear_bit(PG_dc_clean, &folio->flags); return; } /* don't handle anon pages here */ - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!mapping) return; @@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page) * Make a note that K-mapping is dirty */ if (!mapping_mapped(mapping)) { - clear_bit(PG_dc_clean, &page->flags); - } else if (page_mapcount(page)) { - + clear_bit(PG_dc_clean, &folio->flags); + } else if (folio_mapped(folio)) { /* kernel reading from page with U-mapping */ - phys_addr_t paddr = (unsigned long)page_address(page); - unsigned long vaddr = page->index << PAGE_SHIFT; + phys_addr_t paddr = (unsigned long)folio_address(folio); + unsigned long vaddr = folio_pos(folio); + /* + * vaddr is not actually the virtual address, but is + * congruent to every user mapping. + */ if (addr_not_cache_congruent(paddr, vaddr)) - __flush_dcache_page(paddr, vaddr); + __flush_dcache_pages(paddr, vaddr, + folio_nr_pages(folio)); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + return flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); /* @@ -921,18 +931,18 @@ void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len) } /* wrapper to compile time eliminate alignment checks in flush loop */ -void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr) +void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr) { - __ic_line_inv_vaddr(paddr, vaddr, PAGE_SIZE); + __ic_line_inv_vaddr(paddr, vaddr, nr * PAGE_SIZE); } /* * wrapper to clearout kernel or userspace mappings of a page * For kernel mappings @vaddr == @paddr */ -void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr) +void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr) { - __dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV); + __dc_line_op(paddr, vaddr & PAGE_MASK, nr * PAGE_SIZE, OP_FLUSH_N_INV); } noinline void flush_cache_all(void) @@ -962,10 +972,10 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr, u_vaddr &= PAGE_MASK; - __flush_dcache_page(paddr, u_vaddr); + __flush_dcache_pages(paddr, u_vaddr, 1); if (vma->vm_flags & VM_EXEC) - __inv_icache_page(paddr, u_vaddr); + __inv_icache_pages(paddr, u_vaddr, 1); } void flush_cache_range(struct vm_area_struct *vma, unsigned long start, @@ -978,9 +988,9 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long u_vaddr) { /* TBD: do we really need to clear the kernel mapping */ - __flush_dcache_page((phys_addr_t)page_address(page), u_vaddr); - __flush_dcache_page((phys_addr_t)page_address(page), - (phys_addr_t)page_address(page)); + __flush_dcache_pages((phys_addr_t)page_address(page), u_vaddr, 1); + __flush_dcache_pages((phys_addr_t)page_address(page), + (phys_addr_t)page_address(page), 1); } @@ -989,6 +999,8 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); + struct folio *dst = page_folio(to); void *kfrom = kmap_atomic(from); void *kto = kmap_atomic(to); int clean_src_k_mappings = 0; @@ -1005,7 +1017,7 @@ void copy_user_highpage(struct page *to, struct page *from, * addr_not_cache_congruent() is 0 */ if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) { - __flush_dcache_page((unsigned long)kfrom, u_vaddr); + __flush_dcache_pages((unsigned long)kfrom, u_vaddr, 1); clean_src_k_mappings = 1; } @@ -1019,17 +1031,17 @@ void copy_user_highpage(struct page *to, struct page *from, * non copied user pages (e.g. read faults which wire in pagecache page * directly). */ - clear_bit(PG_dc_clean, &to->flags); + clear_bit(PG_dc_clean, &dst->flags); /* * if SRC was already usermapped and non-congruent to kernel mapping * sync the kernel mapping back to physical page */ if (clean_src_k_mappings) { - __flush_dcache_page((unsigned long)kfrom, (unsigned long)kfrom); - set_bit(PG_dc_clean, &from->flags); + __flush_dcache_pages((unsigned long)kfrom, + (unsigned long)kfrom, 1); } else { - clear_bit(PG_dc_clean, &from->flags); + clear_bit(PG_dc_clean, &src->flags); } kunmap_atomic(kto); @@ -1038,8 +1050,9 @@ void copy_user_highpage(struct page *to, struct page *from, void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) { + struct folio *folio = page_folio(page); clear_page(to); - clear_bit(PG_dc_clean, &page->flags); + clear_bit(PG_dc_clean, &folio->flags); } EXPORT_SYMBOL(clear_user_page); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index 5f71445f26bd..6f40f37e6550 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -467,8 +467,8 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep) * Note that flush (when done) involves both WBACK - so physical page is * in sync as well as INV - so any non-congruent aliases don't remain */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long vaddr_unaligned, pte_t *ptep, unsigned int nr) { unsigned long vaddr = vaddr_unaligned & PAGE_MASK; phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; @@ -491,15 +491,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned, */ if ((vma->vm_flags & VM_EXEC) || addr_not_cache_congruent(paddr, vaddr)) { - - int dirty = !test_and_set_bit(PG_dc_clean, &page->flags); + struct folio *folio = page_folio(page); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); if (dirty) { + unsigned long offset = offset_in_folio(folio, paddr); + nr = folio_nr_pages(folio); + paddr -= offset; + vaddr -= offset; /* wback + inv dcache lines (K-mapping) */ - __flush_dcache_page(paddr, paddr); + __flush_dcache_pages(paddr, paddr, nr); /* invalidate any existing icache lines (U-mapping) */ if (vma->vm_flags & VM_EXEC) - __inv_icache_page(paddr, vaddr); + __inv_icache_pages(paddr, vaddr, nr); } } } @@ -531,7 +535,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { pte_t pte = __pte(pmd_val(*pmd)); - update_mmu_cache(vma, addr, &pte); + update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); } void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, -- cgit v1.2.3 From 8b5989f3333717273d02ab87ba8781f72a6783ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:37 +0100 Subject: arm: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_dcache_clear flag from being per-page to per-folio which makes __dma_page_dev_to_cpu() a bit more exciting. Also add flush_cache_pages(), even though this isn't used by generic code (yet?) [m.szyprowski@samsung.com: fix potential endless loop in __dma_page_dev_to_cpu()] Link: https://lkml.kernel.org/r/20230809172737.3574190-1-m.szyprowski@samsung.com [willy@infradead.org: fix folio conversion in __dma_page_dev_to_cpu()] Link: https://lkml.kernel.org/r/20230823191852.1556561-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Marek Szyprowski Acked-by: Mike Rapoport (IBM) Reviewed-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/cacheflush.h | 24 ++++++---- arch/arm/include/asm/pgtable.h | 5 +- arch/arm/include/asm/tlbflush.h | 14 ++++-- arch/arm/mm/copypage-v4mc.c | 5 +- arch/arm/mm/copypage-v6.c | 5 +- arch/arm/mm/copypage-xscale.c | 5 +- arch/arm/mm/dma-mapping.c | 28 ++++++----- arch/arm/mm/fault-armv.c | 16 +++---- arch/arm/mm/flush.c | 99 ++++++++++++++++++++++++--------------- arch/arm/mm/mm.h | 2 +- arch/arm/mm/mmu.c | 14 ++++-- arch/arm/mm/nommu.c | 6 +++ 12 files changed, 136 insertions(+), 87 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index a094f964c869..841e268d2374 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -231,14 +231,15 @@ vivt_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned vma->vm_flags); } -static inline void -vivt_flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) +static inline void vivt_flush_cache_pages(struct vm_area_struct *vma, + unsigned long user_addr, unsigned long pfn, unsigned int nr) { struct mm_struct *mm = vma->vm_mm; if (!mm || cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) { unsigned long addr = user_addr & PAGE_MASK; - __cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags); + __cpuc_flush_user_range(addr, addr + nr * PAGE_SIZE, + vma->vm_flags); } } @@ -247,15 +248,17 @@ vivt_flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig vivt_flush_cache_mm(mm) #define flush_cache_range(vma,start,end) \ vivt_flush_cache_range(vma,start,end) -#define flush_cache_page(vma,addr,pfn) \ - vivt_flush_cache_page(vma,addr,pfn) +#define flush_cache_pages(vma, addr, pfn, nr) \ + vivt_flush_cache_pages(vma, addr, pfn, nr) #else -extern void flush_cache_mm(struct mm_struct *mm); -extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn); +void flush_cache_mm(struct mm_struct *mm); +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr, + unsigned long pfn, unsigned int nr); #endif #define flush_cache_dup_mm(mm) flush_cache_mm(mm) +#define flush_cache_page(vma, addr, pfn) flush_cache_pages(vma, addr, pfn, 1) /* * flush_icache_user_range is used when we want to ensure that the @@ -289,7 +292,9 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr * See update_mmu_cache for the user space part. */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *); +void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) @@ -321,6 +326,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, * duplicate cache flushing elsewhere performed by flush_dcache_page(). */ #define flush_icache_page(vma,page) do { } while (0) +#define flush_icache_pages(vma, page, nr) do { } while (0) /* * flush_cache_vmap() is used when creating mappings (eg, via vmap, diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 34662a9d4cab..ba573f22d7cc 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -207,8 +207,9 @@ static inline void __sync_icache_dcache(pte_t pteval) extern void __sync_icache_dcache(pte_t pteval); #endif -void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); +void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr); +#define set_ptes set_ptes static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 0ccc985b90af..38c6e4a2a0b6 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -619,18 +619,22 @@ extern void flush_bp_all(void); * If PG_dcache_clean is not set for the page, we need to ensure that any * cache entries for the kernels virtual memory range are written * back to the page. On ARMv6 and later, the cache coherency is handled via - * the set_pte_at() function. + * the set_ptes() function. */ #if __LINUX_ARM_ARCH__ < 6 -extern void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr); #else -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) { } #endif +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #endif diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index f1da3b439b96..7ddd82b9fe8b 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -64,10 +64,11 @@ static void mc_copy_user_page(void *from, void *to) void v4_mc_copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index d8a115de5507..a1a71f36d850 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -69,11 +69,12 @@ static void discard_old_kernel_data(void *kto) static void v6_copy_user_highpage_aliasing(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); unsigned int offset = CACHE_COLOUR(vaddr); unsigned long kfrom, kto; - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); /* FIXME: not highmem safe */ discard_old_kernel_data(page_address(to)); diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index bcb485620a05..f1e29d3e8193 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -84,10 +84,11 @@ static void mc_copy_user_page(void *from, void *to) void xscale_mc_copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 033a1bce2b17..5409225b4abc 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -709,19 +709,21 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, * Mark the D-cache clean for these pages to avoid extra flushing. */ if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) { - unsigned long pfn; - size_t left = size; - - pfn = page_to_pfn(page) + off / PAGE_SIZE; - off %= PAGE_SIZE; - if (off) { - pfn++; - left -= PAGE_SIZE - off; - } - while (left >= PAGE_SIZE) { - page = pfn_to_page(pfn++); - set_bit(PG_dcache_clean, &page->flags); - left -= PAGE_SIZE; + struct folio *folio = pfn_folio(paddr / PAGE_SIZE); + size_t offset = offset_in_folio(folio, paddr); + + for (;;) { + size_t sz = folio_size(folio) - offset; + + if (size < sz) + break; + if (!offset) + set_bit(PG_dcache_clean, &folio->flags); + offset = 0; + size -= sz; + if (!size) + break; + folio = folio_next(folio); } } } diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7cb125497976..2286c2ea60ec 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -180,12 +180,12 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * * Note that the pte lock will be held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); struct address_space *mapping; - struct page *page; + struct folio *folio; if (!pfn_valid(pfn)) return; @@ -194,13 +194,13 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + folio = page_folio(pfn_to_page(pfn)); + mapping = folio_flush_mapping(folio); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) make_coherent(mapping, vma, addr, ptep, pfn); diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 2508be91b7a0..d19d140a10c7 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -95,10 +95,10 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned __flush_icache_all(); } -void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) +void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn, unsigned int nr) { if (cache_is_vivt()) { - vivt_flush_cache_page(vma, user_addr, pfn); + vivt_flush_cache_pages(vma, user_addr, pfn, nr); return; } @@ -196,29 +196,31 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, #endif } -void __flush_dcache_page(struct address_space *mapping, struct page *page) +void __flush_dcache_folio(struct address_space *mapping, struct folio *folio) { /* * Writeback any data associated with the kernel mapping of this * page. This ensures that data in the physical page is mutually * coherent with the kernels mapping. */ - if (!PageHighMem(page)) { - __cpuc_flush_dcache_area(page_address(page), page_size(page)); + if (!folio_test_highmem(folio)) { + __cpuc_flush_dcache_area(folio_address(folio), + folio_size(folio)); } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { - for (i = 0; i < compound_nr(page); i++) { - void *addr = kmap_atomic(page + i); + for (i = 0; i < folio_nr_pages(folio); i++) { + void *addr = kmap_local_folio(folio, + i * PAGE_SIZE); __cpuc_flush_dcache_area(addr, PAGE_SIZE); - kunmap_atomic(addr); + kunmap_local(addr); } } else { - for (i = 0; i < compound_nr(page); i++) { - void *addr = kmap_high_get(page + i); + for (i = 0; i < folio_nr_pages(folio); i++) { + void *addr = kmap_high_get(folio_page(folio, i)); if (addr) { __cpuc_flush_dcache_area(addr, PAGE_SIZE); - kunmap_high(page + i); + kunmap_high(folio_page(folio, i)); } } } @@ -230,15 +232,14 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) * userspace colour, which is congruent with page->index. */ if (mapping && cache_is_vipt_aliasing()) - flush_pfn_alias(page_to_pfn(page), - page->index << PAGE_SHIFT); + flush_pfn_alias(folio_pfn(folio), folio_pos(folio)); } -static void __flush_dcache_aliases(struct address_space *mapping, struct page *page) +static void __flush_dcache_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; - struct vm_area_struct *mpnt; - pgoff_t pgoff; + struct vm_area_struct *vma; + pgoff_t pgoff, pgoff_end; /* * There are possible user space mappings of this page: @@ -246,21 +247,36 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p * data in the current VM view associated with this page. * - aliasing VIPT: we only need to find one mapping of this page. */ - pgoff = page->index; + pgoff = folio->index; + pgoff_end = pgoff + folio_nr_pages(folio) - 1; flush_dcache_mmap_lock(mapping); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - unsigned long offset; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff_end) { + unsigned long start, offset, pfn; + unsigned int nr; /* * If this VMA is not in our MM, we can ignore it. */ - if (mpnt->vm_mm != mm) + if (vma->vm_mm != mm) continue; - if (!(mpnt->vm_flags & VM_MAYSHARE)) + if (!(vma->vm_flags & VM_MAYSHARE)) continue; - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - flush_cache_page(mpnt, mpnt->vm_start + offset, page_to_pfn(page)); + + start = vma->vm_start; + pfn = folio_pfn(folio); + nr = folio_nr_pages(folio); + offset = pgoff - vma->vm_pgoff; + if (offset > -nr) { + pfn -= offset; + nr += offset; + } else { + start += offset * PAGE_SIZE; + } + if (start + nr * PAGE_SIZE > vma->vm_end) + nr = (vma->vm_end - start) / PAGE_SIZE; + + flush_cache_pages(vma, start, pfn, nr); } flush_dcache_mmap_unlock(mapping); } @@ -269,7 +285,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p void __sync_icache_dcache(pte_t pteval) { unsigned long pfn; - struct page *page; + struct folio *folio; struct address_space *mapping; if (cache_is_vipt_nonaliasing() && !pte_exec(pteval)) @@ -279,14 +295,14 @@ void __sync_icache_dcache(pte_t pteval) if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); + folio = page_folio(pfn_to_page(pfn)); if (cache_is_vipt_aliasing()) - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); else mapping = NULL; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(mapping, folio); if (pte_exec(pteval)) __flush_icache_all(); @@ -312,7 +328,7 @@ void __sync_icache_dcache(pte_t pteval) * Note that we disable the lazy flush for SMP configurations where * the cache maintenance operations are not automatically broadcasted. */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; @@ -320,31 +336,36 @@ void flush_dcache_page(struct page *page) * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); return; } - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!cache_ops_need_broadcast() && - mapping && !page_mapcount(page)) - clear_bit(PG_dcache_clean, &page->flags); + mapping && !folio_mapped(folio)) + clear_bit(PG_dcache_clean, &folio->flags); else { - __flush_dcache_page(mapping, page); + __flush_dcache_folio(mapping, folio); if (mapping && cache_is_vivt()) - __flush_dcache_aliases(mapping, page); + __flush_dcache_aliases(mapping, folio); else if (mapping) __flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} +EXPORT_SYMBOL(flush_dcache_page); /* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. The expected sequence is: diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index d7ffccb7fea7..419316316711 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -45,7 +45,7 @@ struct mem_type { const struct mem_type *get_mem_type(unsigned int type); -extern void __flush_dcache_page(struct address_space *mapping, struct page *page); +void __flush_dcache_folio(struct address_space *mapping, struct folio *folio); /* * ARM specific vm_struct->flags bits. diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index fdeaee30d167..674ed71573a8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1789,7 +1789,7 @@ void __init paging_init(const struct machine_desc *mdesc) bootmem_init(); empty_zero_page = virt_to_page(zero_page); - __flush_dcache_page(NULL, empty_zero_page); + __flush_dcache_folio(NULL, page_folio(empty_zero_page)); } void __init early_mm_init(const struct machine_desc *mdesc) @@ -1798,8 +1798,8 @@ void __init early_mm_init(const struct machine_desc *mdesc) early_paging_init(mdesc); } -void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr) { unsigned long ext = 0; @@ -1809,5 +1809,11 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, ext |= PTE_EXT_NG; } - set_pte_ext(ptep, pteval, ext); + for (;;) { + set_pte_ext(ptep, pteval, ext); + if (--nr == 0) + break; + ptep++; + pte_val(pteval) += PAGE_SIZE; + } } diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 43cfd06bbeba..c415f3859b20 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -180,6 +180,12 @@ void setup_mm_for_reboot(void) { } +void flush_dcache_folio(struct folio *folio) +{ + __cpuc_flush_dcache_area(folio_address(folio), folio_size(folio)); +} +EXPORT_SYMBOL(flush_dcache_folio); + void flush_dcache_page(struct page *page) { __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); -- cgit v1.2.3 From 4a169d61c2ede9fdf27103e1f454d4a0401d9025 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:38 +0100 Subject: arm64: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Catalin Marinas Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/arm64/include/asm/cacheflush.h | 4 +++- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++++++++------- arch/arm64/mm/flush.c | 36 ++++++++++++++---------------------- 3 files changed, 36 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 37185e978aeb..d115451ed263 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -114,7 +114,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, #define copy_to_user_page copy_to_user_page /* - * flush_dcache_page is used when the kernel has written to the page + * flush_dcache_folio is used when the kernel has written to the page * cache page at virtual address page->virtual. * * If this page isn't mapped (ie, page_mapping == NULL), or it might @@ -127,6 +127,8 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); +#define flush_dcache_folio flush_dcache_folio static __always_inline void icache_inval_all_pou(void) { diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 445b18d7a47c..76bba654b5d7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -345,12 +345,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_ptes_set(mm, ptep, pte, 1); - return __set_pte_at(mm, addr, ptep, pte); +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + + for (;;) { + __set_pte_at(mm, addr, ptep, pte); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } } +#define set_ptes set_ptes /* * Huge pte definitions. @@ -1049,8 +1058,9 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) /* * On AArch64, the cache coherency is handled via the set_pte_at() function. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) { /* * We don't do anything here, so there's a very small chance of @@ -1059,6 +1069,8 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, */ } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #ifdef CONFIG_ARM64_PA_BITS_52 diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 4e6476094952..013eead9b695 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -51,20 +51,13 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, void __sync_icache_dcache(pte_t pte) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - /* - * HugeTLB pages are always fully mapped, so only setting head page's - * PG_dcache_clean flag is enough. - */ - if (PageHuge(page)) - page = compound_head(page); - - if (!test_bit(PG_dcache_clean, &page->flags)) { - sync_icache_aliases((unsigned long)page_address(page), - (unsigned long)page_address(page) + - page_size(page)); - set_bit(PG_dcache_clean, &page->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + sync_icache_aliases((unsigned long)folio_address(folio), + (unsigned long)folio_address(folio) + + folio_size(folio)); + set_bit(PG_dcache_clean, &folio->flags); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -74,17 +67,16 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); * it as dirty for later flushing when mapped in user space (if executable, * see __sync_icache_dcache). */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - /* - * HugeTLB pages are always fully mapped and only head page will be - * set PG_dcache_clean (see comments in __sync_icache_dcache()). - */ - if (PageHuge(page)) - page = compound_head(page); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +EXPORT_SYMBOL(flush_dcache_folio); - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } EXPORT_SYMBOL(flush_dcache_page); -- cgit v1.2.3 From e724e7aaf9ca794670a4d4931af7a7e24e37fec3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:39 +0100 Subject: csky: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Guo Ren Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/csky/abiv1/cacheflush.c | 32 +++++++++++++++++++------------- arch/csky/abiv1/inc/abi/cacheflush.h | 2 ++ arch/csky/abiv2/cacheflush.c | 32 ++++++++++++++++---------------- arch/csky/abiv2/inc/abi/cacheflush.h | 10 ++++++++-- arch/csky/include/asm/pgtable.h | 8 +++++--- 5 files changed, 50 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 94fbc03cbe70..171e8fb32285 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -15,45 +15,51 @@ #define PG_dcache_clean PG_arch_1 -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); - if (mapping && !page_mapcount(page)) - clear_bit(PG_dcache_clean, &page->flags); + if (mapping && !folio_mapped(folio)) + clear_bit(PG_dcache_clean, &folio->flags); else { dcache_wbinv_all(); if (mapping) icache_inv_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); - struct page *page; + struct folio *folio; flush_tlb_page(vma, addr); if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + folio = page_folio(pfn_to_page(pfn)); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) dcache_wbinv_all(); - if (page_mapping_file(page)) { + if (folio_flush_mapping(folio)) { if (vma->vm_flags & VM_EXEC) icache_inv_all(); } diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index ed62e2066ba7..0d6cb65624c4 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -9,6 +9,8 @@ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); +#define flush_dcache_folio flush_dcache_folio #define flush_cache_mm(mm) dcache_wbinv_all() #define flush_cache_page(vma, page, pfn) cache_wbinv_all() diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index 9923cd24db58..d05a551af5d5 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -7,32 +7,32 @@ #include #include -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *pte, unsigned int nr) { - unsigned long addr; - struct page *page; + unsigned long pfn = pte_pfn(*pte); + struct folio *folio; + unsigned int i; flush_tlb_page(vma, address); - if (!pfn_valid(pte_pfn(*pte))) + if (!pfn_valid(pfn)) return; - page = pfn_to_page(pte_pfn(*pte)); - if (page == ZERO_PAGE(0)) - return; + folio = page_folio(pfn_to_page(pfn)); - if (test_and_set_bit(PG_dcache_clean, &page->flags)) + if (test_and_set_bit(PG_dcache_clean, &folio->flags)) return; - addr = (unsigned long) kmap_atomic(page); - - dcache_wb_range(addr, addr + PAGE_SIZE); + for (i = 0; i < folio_nr_pages(folio); i++) { + unsigned long addr = (unsigned long) kmap_local_folio(folio, + i * PAGE_SIZE); - if (vma->vm_flags & VM_EXEC) - icache_inv_range(addr, addr + PAGE_SIZE); - - kunmap_atomic((void *) addr); + dcache_wb_range(addr, addr + PAGE_SIZE); + if (vma->vm_flags & VM_EXEC) + icache_inv_range(addr, addr + PAGE_SIZE); + kunmap_local((void *) addr); + } } void flush_icache_deferred(struct mm_struct *mm) diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index a565e00c3f70..9c728933a776 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -18,11 +18,17 @@ #define PG_dcache_clean PG_arch_1 +static inline void flush_dcache_folio(struct folio *folio) +{ + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + flush_dcache_folio(page_folio(page)); } #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index d4042495febc..42405037c871 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -28,6 +28,7 @@ #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) #define pte_clear(mm, addr, ptep) set_pte((ptep), \ @@ -90,7 +91,6 @@ static inline void set_pte(pte_t *p, pte_t pte) /* prevent out of order excution */ smp_mb(); } -#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) static inline pte_t *pmd_page_vaddr(pmd_t pmd) { @@ -263,8 +263,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init(void); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *pte, unsigned int nr); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) -- cgit v1.2.3 From 9ff633944165d11c53c088d9596db3da66e90396 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:40 +0100 Subject: hexagon: implement the new page table range API Add PFN_PTE_SHIFT and update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Brian Cain Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/cacheflush.h | 8 ++++++-- arch/hexagon/include/asm/pgtable.h | 9 +-------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index 6eff0730e6ef..dc3f500a5a01 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -58,12 +58,16 @@ extern void flush_cache_all_hexagon(void); * clean the cache when the PTE is set. * */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { /* generic_ptrace_pokedata doesn't wind up here, does it? */ } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); #define copy_to_user_page copy_to_user_page diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 59393613d086..dd05dd71b8ec 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -338,6 +338,7 @@ static inline int pte_exec(pte_t pte) /* __swp_entry_to_pte - extract PTE from swap entry */ #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define PFN_PTE_SHIFT PAGE_SHIFT /* pfn_pte - convert page number and protection value to page table entry */ #define pfn_pte(pfn, pgprot) __pte((pfn << PAGE_SHIFT) | pgprot_val(pgprot)) @@ -345,14 +346,6 @@ static inline int pte_exec(pte_t pte) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) -/* - * set_pte_at - update page table and do whatever magic may be - * necessary to make the underlying hardware/firmware take note. - * - * VM may require a virtual instruction to alert the MMU. - */ -#define set_pte_at(mm, addr, ptep, pte) set_pte(ptep, pte) - static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_val(pmd) & PAGE_MASK); -- cgit v1.2.3 From 876397837d582ce72f977ac3e635ce74eebcecc9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:41 +0100 Subject: ia64: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_clean) flag from being per-page to per-folio, which makes arch_dma_mark_clean() and mark_clean() a little more exciting. [willy@infradead.org: fix folio_size() handling] Link: https://lkml.kernel.org/r/ZNPlOCe8F+nrzPxr@casper.infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/ia64/hp/common/sba_iommu.c | 30 +++++++++++++++++++----------- arch/ia64/include/asm/cacheflush.h | 14 ++++++++++---- arch/ia64/include/asm/pgtable.h | 4 ++-- arch/ia64/mm/init.c | 32 +++++++++++++++++++++++--------- 4 files changed, 54 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 8ad6946521d8..c4d477e8bcd4 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -798,22 +798,30 @@ sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba) #endif #ifdef ENABLE_MARK_CLEAN -/** +/* * Since DMA is i-cache coherent, any (complete) pages that were written via * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -static void -mark_clean (void *addr, size_t size) +static void mark_clean(void *addr, size_t size) { - unsigned long pg_addr, end; - - pg_addr = PAGE_ALIGN((unsigned long) addr); - end = (unsigned long) addr + size; - while (pg_addr + PAGE_SIZE <= end) { - struct page *page = virt_to_page((void *)pg_addr); - set_bit(PG_arch_1, &page->flags); - pg_addr += PAGE_SIZE; + struct folio *folio = virt_to_folio(addr); + ssize_t left = size; + size_t offset = offset_in_folio(folio, addr); + + if (offset) { + left -= folio_size(folio) - offset; + if (left <= 0) + return; + folio = folio_next(folio); + } + + while (left >= folio_size(folio)) { + left -= folio_size(folio); + set_bit(PG_arch_1, &folio->flags); + if (!left) + break; + folio = folio_next(folio); } } #endif diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index 708c0fa5d975..eac493fa9e0d 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -13,10 +13,16 @@ #include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -#define flush_dcache_page(page) \ -do { \ - clear_bit(PG_arch_1, &(page)->flags); \ -} while (0) +static inline void flush_dcache_folio(struct folio *folio) +{ + clear_bit(PG_arch_1, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range flush_icache_range diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 21c97e31a28a..4e5dd800ce1f 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -206,6 +206,7 @@ ia64_phys_addr_valid (unsigned long addr) #define RGN_MAP_SHIFT (PGDIR_SHIFT + PTRS_PER_PGD_SHIFT - 3) #define RGN_MAP_LIMIT ((1UL << RGN_MAP_SHIFT) - PAGE_SIZE) /* per region addr limit */ +#define PFN_PTE_SHIFT PAGE_SHIFT /* * Conversion functions: convert page frame number (pfn) and a protection value to a page * table entry (pte). @@ -303,8 +304,6 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* * Make page protection values cacheable, uncacheable, or write- * combining. Note that "protection" is really a misnomer here as the @@ -396,6 +395,7 @@ pte_same (pte_t a, pte_t b) return pte_val(a) == pte_val(b); } +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0) #define update_mmu_cache(vma, address, ptep) do { } while (0) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 7f5353e28516..05b0f2f0c073 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -50,30 +50,44 @@ void __ia64_sync_icache_dcache (pte_t pte) { unsigned long addr; - struct page *page; + struct folio *folio; - page = pte_page(pte); - addr = (unsigned long) page_address(page); + folio = page_folio(pte_page(pte)); + addr = (unsigned long)folio_address(folio); - if (test_bit(PG_arch_1, &page->flags)) + if (test_bit(PG_arch_1, &folio->flags)) return; /* i-cache is already coherent with d-cache */ - flush_icache_range(addr, addr + page_size(page)); - set_bit(PG_arch_1, &page->flags); /* mark page as clean */ + flush_icache_range(addr, addr + folio_size(folio)); + set_bit(PG_arch_1, &folio->flags); /* mark page as clean */ } /* - * Since DMA is i-cache coherent, any (complete) pages that were written via + * Since DMA is i-cache coherent, any (complete) folios that were written via * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ void arch_dma_mark_clean(phys_addr_t paddr, size_t size) { unsigned long pfn = PHYS_PFN(paddr); + struct folio *folio = page_folio(pfn_to_page(pfn)); + ssize_t left = size; + size_t offset = offset_in_folio(folio, paddr); - do { + if (offset) { + left -= folio_size(folio) - offset; + if (left <= 0) + return; + folio = folio_next(folio); + } + + while (left >= (ssize_t)folio_size(folio)) { + left -= folio_size(folio); set_bit(PG_arch_1, &pfn_to_page(pfn)->flags); - } while (++pfn <= PHYS_PFN(paddr + size - 1)); + if (!left) + break; + folio = folio_next(folio); + } } inline void -- cgit v1.2.3 From a6d01af08b2e40772cf97e700b699850f6862886 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:42 +0100 Subject: loongarch: implement the new page table range API Add update_mmu_cache_range() and change _PFN_SHIFT to PFN_PTE_SHIFT. It would probably be more efficient to implement __update_tlb() by flushing the entire folio instead of calling __update_tlb() N times, but I'll leave that for someone who understands the architecture better. Link: https://lkml.kernel.org/r/20230802151406.3735276-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Huacai Chen Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/cacheflush.h | 1 + arch/loongarch/include/asm/pgtable-bits.h | 4 ++-- arch/loongarch/include/asm/pgtable.h | 33 +++++++++++++++++-------------- arch/loongarch/mm/pgtable.c | 2 +- arch/loongarch/mm/tlb.c | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 0681788eb474..88a44da50a3b 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -47,6 +47,7 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #define flush_icache_page(vma, page) do { } while (0) +#define flush_icache_pages(vma, page) do { } while (0) #define flush_icache_user_page(vma, page, addr, len) do { } while (0) #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index de46a6b1e9f1..35348d4c4209 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -50,12 +50,12 @@ #define _PAGE_NO_EXEC (_ULCAST_(1) << _PAGE_NO_EXEC_SHIFT) #define _PAGE_RPLV (_ULCAST_(1) << _PAGE_RPLV_SHIFT) #define _CACHE_MASK (_ULCAST_(3) << _CACHE_SHIFT) -#define _PFN_SHIFT (PAGE_SHIFT - 12 + _PAGE_PFN_SHIFT) +#define PFN_PTE_SHIFT (PAGE_SHIFT - 12 + _PAGE_PFN_SHIFT) #define _PAGE_USER (PLV_USER << _PAGE_PLV_SHIFT) #define _PAGE_KERN (PLV_KERN << _PAGE_PLV_SHIFT) -#define _PFN_MASK (~((_ULCAST_(1) << (_PFN_SHIFT)) - 1) & \ +#define _PFN_MASK (~((_ULCAST_(1) << (PFN_PTE_SHIFT)) - 1) & \ ((_ULCAST_(1) << (_PAGE_PFN_END_SHIFT)) - 1)) /* diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 38afeb7dd58b..e7cf25e452c0 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -237,9 +237,9 @@ extern pmd_t mk_pmd(struct page *page, pgprot_t prot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_pfn(x) ((unsigned long)(((x).pte & _PFN_MASK) >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)(((x).pte & _PFN_MASK) >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) __pte(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) /* * Initialize a new pgd / pud / pmd table with invalid pointers. @@ -334,19 +334,13 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) } } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - set_pte(ptep, pteval); -} - static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* Preserve global status for the pair */ if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) - set_pte_at(mm, addr, ptep, __pte(_PAGE_GLOBAL)); + set_pte(ptep, __pte(_PAGE_GLOBAL)); else - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); } #define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1) @@ -445,11 +439,20 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { - __update_tlb(vma, address, ptep); + for (;;) { + __update_tlb(vma, address, ptep); + if (--nr == 0) + break; + address += PAGE_SIZE; + ptep++; + } } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache @@ -462,7 +465,7 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT; + return (pmd_val(pmd) & _PFN_MASK) >> PFN_PTE_SHIFT; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 5bd102b51f7c..b14343e211b6 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -108,7 +108,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index 00bb563e3c89..eb8572e201ea 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -252,7 +252,7 @@ static void output_pgtable_bits_defines(void) pr_define("_PAGE_WRITE_SHIFT %d\n", _PAGE_WRITE_SHIFT); pr_define("_PAGE_NO_READ_SHIFT %d\n", _PAGE_NO_READ_SHIFT); pr_define("_PAGE_NO_EXEC_SHIFT %d\n", _PAGE_NO_EXEC_SHIFT); - pr_define("_PFN_SHIFT %d\n", _PFN_SHIFT); + pr_define("PFN_PTE_SHIFT %d\n", PFN_PTE_SHIFT); pr_debug("\n"); } -- cgit v1.2.3 From 5553b15a4bbba8039e1f31b63642048286f540dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:43 +0100 Subject: m68k: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). Link: https://lkml.kernel.org/r/20230802151406.3735276-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Geert Uytterhoeven Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/m68k/include/asm/cacheflush_mm.h | 27 ++++++++++++++++++--------- arch/m68k/include/asm/mcf_pgtable.h | 1 + arch/m68k/include/asm/motorola_pgtable.h | 1 + arch/m68k/include/asm/pgtable_mm.h | 10 ++++++---- arch/m68k/include/asm/sun3_pgtable.h | 1 + arch/m68k/mm/motorola.c | 2 +- 6 files changed, 28 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1ac55e7b47f0..88eb85e81ef6 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -220,24 +220,29 @@ static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long vm /* Push the page at kernel virtual address and clear the icache */ /* RZ: use cpush %bc instead of cpush %dc, cinv %ic */ -static inline void __flush_page_to_ram(void *vaddr) +static inline void __flush_pages_to_ram(void *vaddr, unsigned int nr) { if (CPU_IS_COLDFIRE) { unsigned long addr, start, end; addr = ((unsigned long) vaddr) & ~(PAGE_SIZE - 1); start = addr & ICACHE_SET_MASK; - end = (addr + PAGE_SIZE - 1) & ICACHE_SET_MASK; + end = (addr + nr * PAGE_SIZE - 1) & ICACHE_SET_MASK; if (start > end) { flush_cf_bcache(0, end); end = ICACHE_MAX_ADDR; } flush_cf_bcache(start, end); } else if (CPU_IS_040_OR_060) { - __asm__ __volatile__("nop\n\t" - ".chip 68040\n\t" - "cpushp %%bc,(%0)\n\t" - ".chip 68k" - : : "a" (__pa(vaddr))); + unsigned long paddr = __pa(vaddr); + + do { + __asm__ __volatile__("nop\n\t" + ".chip 68040\n\t" + "cpushp %%bc,(%0)\n\t" + ".chip 68k" + : : "a" (paddr)); + paddr += PAGE_SIZE; + } while (--nr); } else { unsigned long _tmp; __asm__ __volatile__("movec %%cacr,%0\n\t" @@ -249,10 +254,14 @@ static inline void __flush_page_to_ram(void *vaddr) } #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -#define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) +#define flush_dcache_page(page) __flush_pages_to_ram(page_address(page), 1) +#define flush_dcache_folio(folio) \ + __flush_pages_to_ram(folio_address(folio), folio_nr_pages(folio)) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) +#define flush_icache_pages(vma, page, nr) \ + __flush_pages_to_ram(page_address(page), nr) +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 43e8da8465f9..772b7e7b0654 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -291,6 +291,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } +#define PFN_PTE_SHIFT PAGE_SHIFT #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index ec0dc19ab834..38d5e5edc3e1 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -112,6 +112,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) #define pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_clear(mm,addr,ptep) ({ pte_val(*(ptep)) = 0; }) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_page(pte) virt_to_page(__va(pte_val(pte))) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index b93c41fe2067..dbdf1c2b2f66 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -31,8 +31,6 @@ do{ \ *(pteptr) = (pteval); \ } while(0) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* PMD_SHIFT determines the size of the area a second-level page table can map */ #if CONFIG_PGTABLE_LEVELS == 3 @@ -138,11 +136,15 @@ extern void kernel_set_cachemode(void *addr, unsigned long size, int cmode); * tables contain all the necessary information. The Sun3 does, but * they are updated on demand. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #endif /* !__ASSEMBLY__ */ /* MMU-specific headers */ diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 9e7bf8a5f8f8..0cc39a88ce55 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -105,6 +105,7 @@ static inline void pte_clear (struct mm_struct *mm, unsigned long addr, pte_t *p pte_val (*ptep) = 0; } +#define PFN_PTE_SHIFT 0 #define pte_pfn(pte) (pte_val(pte) & SUN3_PAGE_PGNUM_MASK) #define pfn_pte(pfn, pgprot) \ ({ pte_t __pte; pte_val(__pte) = pfn | pgprot_val(pgprot); __pte; }) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 594575a0780c..c1761d309fc6 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -81,7 +81,7 @@ static inline void cache_page(void *vaddr) void mmu_page_ctor(void *page) { - __flush_page_to_ram(page); + __flush_pages_to_ram(page, 1); flush_tlb_kernel_page(page); nocache_page(page); } -- cgit v1.2.3 From 27a8b944fe91503ba15241d9a8504a34af0009fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:44 +0100 Subject: microblaze: implement the new page table range API Rename PFN_SHIFT_OFFSET to PTE_PFN_SHIFT. Change the calling convention for set_pte() to be the same as other architectures. Add update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). [arnd@arndb.de: mark flush_dcache_folio() inline] Link: https://lkml.kernel.org/r/20230810141947.1236730-9-arnd@kernel.org Link: https://lkml.kernel.org/r/20230802151406.3735276-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Arnd Bergmann Acked-by: Mike Rapoport (IBM) Cc: Michal Simek Signed-off-by: Andrew Morton --- arch/microblaze/include/asm/cacheflush.h | 8 ++++++++ arch/microblaze/include/asm/pgtable.h | 15 ++++----------- arch/microblaze/include/asm/tlbflush.h | 4 +++- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index 39f8fb6768d8..ffa2cf3893e4 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -74,6 +74,14 @@ do { \ flush_dcache_range((unsigned) (addr), (unsigned) (addr) + PAGE_SIZE); \ } while (0); +static inline void flush_dcache_folio(struct folio *folio) +{ + unsigned long addr = folio_pfn(folio) << PAGE_SHIFT; + + flush_dcache_range(addr, addr + folio_size(folio)); +} +#define flush_dcache_folio flush_dcache_folio + #define flush_cache_page(vma, vmaddr, pfn) \ flush_dcache_range(pfn << PAGE_SHIFT, (pfn << PAGE_SHIFT) + PAGE_SIZE); diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index d1b8272abcd9..6f9b99082518 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -230,12 +230,12 @@ extern unsigned long empty_zero_page[1024]; #define pte_page(x) (mem_map + (unsigned long) \ ((pte_val(x) - memory_start) >> PAGE_SHIFT)) -#define PFN_SHIFT_OFFSET (PAGE_SHIFT) +#define PFN_PTE_SHIFT PAGE_SHIFT -#define pte_pfn(x) (pte_val(x) >> PFN_SHIFT_OFFSET) +#define pte_pfn(x) (pte_val(x) >> PFN_PTE_SHIFT) #define pfn_pte(pfn, prot) \ - __pte(((pte_basic_t)(pfn) << PFN_SHIFT_OFFSET) | pgprot_val(prot)) + __pte(((pte_basic_t)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #ifndef __ASSEMBLY__ /* @@ -330,14 +330,7 @@ static inline unsigned long pte_update(pte_t *p, unsigned long clr, /* * set_pte stores a linux PTE into the linux page table. */ -static inline void set_pte(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} - -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; } diff --git a/arch/microblaze/include/asm/tlbflush.h b/arch/microblaze/include/asm/tlbflush.h index 2038168ed128..a31ae9d44083 100644 --- a/arch/microblaze/include/asm/tlbflush.h +++ b/arch/microblaze/include/asm/tlbflush.h @@ -33,7 +33,9 @@ static inline void local_flush_tlb_range(struct vm_area_struct *vma, #define flush_tlb_kernel_range(start, end) do { } while (0) -#define update_mmu_cache(vma, addr, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) +#define update_mmu_cache(vma, addr, pte) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define flush_tlb_all local_flush_tlb_all #define flush_tlb_mm local_flush_tlb_mm -- cgit v1.2.3 From 15fa3e8e32692a423209a1808ef098f7ec3174f5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:45 +0100 Subject: mips: implement the new page table range API Rename _PFN_SHIFT to PFN_PTE_SHIFT. Convert a few places to call set_pte() instead of set_pte_at(). Add set_ptes(), update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/bcm47xx/prom.c | 2 +- arch/mips/include/asm/cacheflush.h | 32 +++++++++++------- arch/mips/include/asm/pgtable-32.h | 10 +++--- arch/mips/include/asm/pgtable-64.h | 6 ++-- arch/mips/include/asm/pgtable-bits.h | 6 ++-- arch/mips/include/asm/pgtable.h | 63 +++++++++++++++++++++++------------- arch/mips/mm/c-r4k.c | 5 +-- arch/mips/mm/cache.c | 56 ++++++++++++++++---------------- arch/mips/mm/init.c | 21 +++++++----- arch/mips/mm/pgtable-32.c | 2 +- arch/mips/mm/pgtable-64.c | 2 +- arch/mips/mm/tlbex.c | 2 +- 12 files changed, 121 insertions(+), 86 deletions(-) (limited to 'arch') diff --git a/arch/mips/bcm47xx/prom.c b/arch/mips/bcm47xx/prom.c index a9bea411d928..99a1ba5394e0 100644 --- a/arch/mips/bcm47xx/prom.c +++ b/arch/mips/bcm47xx/prom.c @@ -116,7 +116,7 @@ void __init prom_init(void) #if defined(CONFIG_BCM47XX_BCMA) && defined(CONFIG_HIGHMEM) #define EXTVBASE 0xc0000000 -#define ENTRYLO(x) ((pte_val(pfn_pte((x) >> _PFN_SHIFT, PAGE_KERNEL_UNCACHED)) >> 6) | 1) +#define ENTRYLO(x) ((pte_val(pfn_pte((x) >> PFN_PTE_SHIFT, PAGE_KERNEL_UNCACHED)) >> 6) | 1) #include diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d8d3f80f9fc0..0f389bc7cb90 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -36,12 +36,12 @@ */ #define PG_dcache_dirty PG_arch_1 -#define Page_dcache_dirty(page) \ - test_bit(PG_dcache_dirty, &(page)->flags) -#define SetPageDcacheDirty(page) \ - set_bit(PG_dcache_dirty, &(page)->flags) -#define ClearPageDcacheDirty(page) \ - clear_bit(PG_dcache_dirty, &(page)->flags) +#define folio_test_dcache_dirty(folio) \ + test_bit(PG_dcache_dirty, &(folio)->flags) +#define folio_set_dcache_dirty(folio) \ + set_bit(PG_dcache_dirty, &(folio)->flags) +#define folio_clear_dcache_dirty(folio) \ + clear_bit(PG_dcache_dirty, &(folio)->flags) extern void (*flush_cache_all)(void); extern void (*__flush_cache_all)(void); @@ -50,15 +50,24 @@ extern void (*flush_cache_mm)(struct mm_struct *mm); extern void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); -extern void __flush_dcache_page(struct page *page); +extern void __flush_dcache_pages(struct page *page, unsigned int nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_folio(struct folio *folio) +{ + if (cpu_has_dc_aliases) + __flush_dcache_pages(&folio->page, folio_nr_pages(folio)); + else if (!cpu_has_ic_fills_f_dc) + folio_set_dcache_dirty(folio); +} +#define flush_dcache_folio flush_dcache_folio + static inline void flush_dcache_page(struct page *page) { if (cpu_has_dc_aliases) - __flush_dcache_page(page); + __flush_dcache_pages(page, 1); else if (!cpu_has_ic_fills_f_dc) - SetPageDcacheDirty(page); + folio_set_dcache_dirty(page_folio(page)); } #define flush_dcache_mmap_lock(mapping) do { } while (0) @@ -73,10 +82,11 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(page, vmaddr); } -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) { } +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void (*flush_icache_range)(unsigned long start, unsigned long end); extern void (*local_flush_icache_range)(unsigned long start, unsigned long end); diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index ba0016709a1a..0e196650f4f4 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -153,7 +153,7 @@ static inline void pmd_clear(pmd_t *pmdp) #if defined(CONFIG_XPA) #define MAX_POSSIBLE_PHYSMEM_BITS 40 -#define pte_pfn(x) (((unsigned long)((x).pte_high >> _PFN_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) +#define pte_pfn(x) (((unsigned long)((x).pte_high >> PFN_PTE_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) { @@ -161,7 +161,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) pte.pte_low = (pfn >> _PAGE_PRESENT_SHIFT) | (pgprot_val(prot) & ~_PFNX_MASK); - pte.pte_high = (pfn << _PFN_SHIFT) | + pte.pte_high = (pfn << PFN_PTE_SHIFT) | (pgprot_val(prot) & ~_PFN_MASK); return pte; } @@ -184,9 +184,9 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 -#define pte_pfn(x) ((unsigned long)((x).pte >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((unsigned long long)(pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((unsigned long long)(pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)((x).pte >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) __pte(((unsigned long long)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((unsigned long long)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #define pte_page(x) pfn_to_page(pte_pfn(x)) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 98e24e3e7f2b..20ca48c1b606 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -298,9 +298,9 @@ static inline void pud_clear(pud_t *pudp) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_pfn(x) ((unsigned long)((x).pte >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)((x).pte >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) __pte(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #ifndef __PAGETABLE_PMD_FOLDED static inline pmd_t *pud_pgtable(pud_t pud) diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h index 1c576679aa87..421e78c30253 100644 --- a/arch/mips/include/asm/pgtable-bits.h +++ b/arch/mips/include/asm/pgtable-bits.h @@ -182,10 +182,10 @@ enum pgtable_bits { #if defined(CONFIG_CPU_R3K_TLB) # define _CACHE_UNCACHED (1 << _CACHE_UNCACHED_SHIFT) # define _CACHE_MASK _CACHE_UNCACHED -# define _PFN_SHIFT PAGE_SHIFT +# define PFN_PTE_SHIFT PAGE_SHIFT #else # define _CACHE_MASK (7 << _CACHE_SHIFT) -# define _PFN_SHIFT (PAGE_SHIFT - 12 + _CACHE_SHIFT + 3) +# define PFN_PTE_SHIFT (PAGE_SHIFT - 12 + _CACHE_SHIFT + 3) #endif #ifndef _PAGE_NO_EXEC @@ -195,7 +195,7 @@ enum pgtable_bits { #define _PAGE_SILENT_READ _PAGE_VALID #define _PAGE_SILENT_WRITE _PAGE_DIRTY -#define _PFN_MASK (~((1 << (_PFN_SHIFT)) - 1)) +#define _PFN_MASK (~((1 << (PFN_PTE_SHIFT)) - 1)) /* * The final layouts of the PTE bits are: diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 574fa14ac8b2..cbb93a834f52 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -66,7 +66,7 @@ extern void paging_init(void); static inline unsigned long pmd_pfn(pmd_t pmd) { - return pmd_val(pmd) >> _PFN_SHIFT; + return pmd_val(pmd) >> PFN_PTE_SHIFT; } #ifndef CONFIG_MIPS_HUGE_TLB_SUPPORT @@ -105,9 +105,6 @@ do { \ } \ } while(0) -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) #ifdef CONFIG_XPA @@ -157,7 +154,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt null.pte_low = null.pte_high = _PAGE_GLOBAL; } - set_pte_at(mm, addr, ptep, null); + set_pte(ptep, null); htw_start(); } #else @@ -196,28 +193,41 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt #if !defined(CONFIG_CPU_R3K_TLB) /* Preserve global status for the pair */ if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) - set_pte_at(mm, addr, ptep, __pte(_PAGE_GLOBAL)); + set_pte(ptep, __pte(_PAGE_GLOBAL)); else #endif - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); htw_start(); } #endif -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { + unsigned int i; + bool do_sync = false; - if (!pte_present(pteval)) - goto cache_sync_done; + for (i = 0; i < nr; i++) { + if (!pte_present(pte)) + continue; + if (pte_present(ptep[i]) && + (pte_pfn(ptep[i]) == pte_pfn(pte))) + continue; + do_sync = true; + } - if (pte_present(*ptep) && (pte_pfn(*ptep) == pte_pfn(pteval))) - goto cache_sync_done; + if (do_sync) + __update_cache(addr, pte); - __update_cache(addr, pteval); -cache_sync_done: - set_pte(ptep, pteval); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + } } +#define set_ptes set_ptes /* * (pmds are folded into puds so this doesn't get actually called, @@ -486,7 +496,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { if (!pte_same(*ptep, entry)) - set_pte_at(vma->vm_mm, address, ptep, entry); + set_pte(ptep, entry); /* * update_mmu_cache will unconditionally execute, handling both * the case that the PTE changed and the spurious fault case. @@ -568,12 +578,21 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) -{ - pte_t pte = *ptep; - __update_tlb(vma, address, pte); +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + pte_t pte = *ptep; + __update_tlb(vma, address, pte); + if (--nr == 0) + break; + ptep++; + address += PAGE_SIZE; + } } +#define update_mmu_cache(vma, address, ptep) \ + update_mmu_cache_range(NULL, vma, address, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 4b6554b48923..187d1c16361c 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -568,13 +568,14 @@ static inline void local_r4k_flush_cache_page(void *args) if ((mm == current->active_mm) && (pte_val(*ptep) & _PAGE_VALID)) vaddr = NULL; else { + struct folio *folio = page_folio(page); /* * Use kmap_coherent or kmap_atomic to do flushes for * another ASID than the current one. */ map_coherent = (cpu_has_dc_aliases && - page_mapcount(page) && - !Page_dcache_dirty(page)); + folio_mapped(folio) && + !folio_test_dcache_dirty(folio)); if (map_coherent) vaddr = kmap_coherent(page, addr); else diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index d21cf8c6cf6c..02042100e267 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -99,13 +99,15 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, return 0; } -void __flush_dcache_page(struct page *page) +void __flush_dcache_pages(struct page *page, unsigned int nr) { - struct address_space *mapping = page_mapping_file(page); + struct folio *folio = page_folio(page); + struct address_space *mapping = folio_flush_mapping(folio); unsigned long addr; + unsigned int i; if (mapping && !mapping_mapped(mapping)) { - SetPageDcacheDirty(page); + folio_set_dcache_dirty(folio); return; } @@ -114,25 +116,21 @@ void __flush_dcache_page(struct page *page) * case is for exec env/arg pages and those are %99 certainly going to * get faulted into the tlb (and thus flushed) anyways. */ - if (PageHighMem(page)) - addr = (unsigned long)kmap_atomic(page); - else - addr = (unsigned long)page_address(page); - - flush_data_cache_page(addr); - - if (PageHighMem(page)) - kunmap_atomic((void *)addr); + for (i = 0; i < nr; i++) { + addr = (unsigned long)kmap_local_page(page + i); + flush_data_cache_page(addr); + kunmap_local((void *)addr); + } } - -EXPORT_SYMBOL(__flush_dcache_page); +EXPORT_SYMBOL(__flush_dcache_pages); void __flush_anon_page(struct page *page, unsigned long vmaddr) { unsigned long addr = (unsigned long) page_address(page); + struct folio *folio = page_folio(page); if (pages_do_alias(addr, vmaddr)) { - if (page_mapcount(page) && !Page_dcache_dirty(page)) { + if (folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); @@ -147,27 +145,29 @@ EXPORT_SYMBOL(__flush_anon_page); void __update_cache(unsigned long address, pte_t pte) { - struct page *page; + struct folio *folio; unsigned long pfn, addr; int exec = !pte_no_exec(pte) && !cpu_has_ic_fills_f_dc; + unsigned int i; pfn = pte_pfn(pte); if (unlikely(!pfn_valid(pfn))) return; - page = pfn_to_page(pfn); - if (Page_dcache_dirty(page)) { - if (PageHighMem(page)) - addr = (unsigned long)kmap_atomic(page); - else - addr = (unsigned long)page_address(page); - - if (exec || pages_do_alias(addr, address & PAGE_MASK)) - flush_data_cache_page(addr); - if (PageHighMem(page)) - kunmap_atomic((void *)addr); + folio = page_folio(pfn_to_page(pfn)); + address &= PAGE_MASK; + address -= offset_in_folio(folio, pfn << PAGE_SHIFT); + + if (folio_test_dcache_dirty(folio)) { + for (i = 0; i < folio_nr_pages(folio); i++) { + addr = (unsigned long)kmap_local_folio(folio, i); - ClearPageDcacheDirty(page); + if (exec || pages_do_alias(addr, address)) + flush_data_cache_page(addr); + kunmap_local((void *)addr); + address += PAGE_SIZE; + } + folio_clear_dcache_dirty(folio); } } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 5a8002839550..5dcb525a8995 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -88,7 +88,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot) pte_t pte; int tlbidx; - BUG_ON(Page_dcache_dirty(page)); + BUG_ON(folio_test_dcache_dirty(page_folio(page))); preempt_disable(); pagefault_disable(); @@ -169,11 +169,12 @@ void kunmap_coherent(void) void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *vfrom, *vto; vto = kmap_atomic(to); if (cpu_has_dc_aliases && - page_mapcount(from) && !Page_dcache_dirty(from)) { + folio_mapped(src) && !folio_test_dcache_dirty(src)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(); @@ -194,15 +195,17 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases && - page_mapcount(page) && !Page_dcache_dirty(page)) { + folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(); } else { memcpy(dst, src, len); if (cpu_has_dc_aliases) - SetPageDcacheDirty(page); + folio_set_dcache_dirty(folio); } if (vma->vm_flags & VM_EXEC) flush_cache_page(vma, vaddr, page_to_pfn(page)); @@ -212,15 +215,17 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases && - page_mapcount(page) && !Page_dcache_dirty(page)) { + folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(); } else { memcpy(dst, src, len); if (cpu_has_dc_aliases) - SetPageDcacheDirty(page); + folio_set_dcache_dirty(folio); } } EXPORT_SYMBOL_GPL(copy_from_user_page); @@ -448,10 +453,10 @@ static inline void __init mem_init_free_highmem(void) void __init mem_init(void) { /* - * When _PFN_SHIFT is greater than PAGE_SHIFT we won't have enough PTE + * When PFN_PTE_SHIFT is greater than PAGE_SHIFT we won't have enough PTE * bits to hold a full 32b physical address on MIPS32 systems. */ - BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (_PFN_SHIFT > PAGE_SHIFT)); + BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); #ifdef CONFIG_HIGHMEM max_mapnr = highend_pfn ? highend_pfn : max_low_pfn; diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index f57fb69472f8..84dd5136d53a 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -35,7 +35,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index b4386a0e2ef8..c76d21f7dffb 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -93,7 +93,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 8d514a9082c6..b4e1c783e617 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -253,7 +253,7 @@ static void output_pgtable_bits_defines(void) pr_define("_PAGE_GLOBAL_SHIFT %d\n", _PAGE_GLOBAL_SHIFT); pr_define("_PAGE_VALID_SHIFT %d\n", _PAGE_VALID_SHIFT); pr_define("_PAGE_DIRTY_SHIFT %d\n", _PAGE_DIRTY_SHIFT); - pr_define("_PFN_SHIFT %d\n", _PFN_SHIFT); + pr_define("PFN_PTE_SHIFT %d\n", PFN_PTE_SHIFT); pr_debug("\n"); } -- cgit v1.2.3 From 994209410919f2b84b7e4ab2e78785d9715308ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:46 +0100 Subject: nios2: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/cacheflush.h | 6 ++- arch/nios2/include/asm/pgtable.h | 28 ++++++++----- arch/nios2/mm/cacheflush.c | 79 ++++++++++++++++++++----------------- 3 files changed, 67 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index d0b71dd71287..8624ca83cffe 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -29,9 +29,13 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio extern void flush_icache_range(unsigned long start, unsigned long end); -extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1); #define flush_cache_vmap(start, end) flush_dcache_range(start, end) #define flush_cache_vunmap(start, end) flush_dcache_range(start, end) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 0f5c2564e9f5..be6bf3e0bd7a 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -178,14 +178,21 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { - unsigned long paddr = (unsigned long)page_to_virt(pte_page(pteval)); - - flush_dcache_range(paddr, paddr + PAGE_SIZE); - set_pte(ptep, pteval); + unsigned long paddr = (unsigned long)page_to_virt(pte_page(pte)); + + flush_dcache_range(paddr, paddr + nr * PAGE_SIZE); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += 1; + } } +#define set_ptes set_ptes static inline int pmd_none(pmd_t pmd) { @@ -202,7 +209,7 @@ static inline void pte_clear(struct mm_struct *mm, pte_val(null) = (addr >> PAGE_SHIFT) & 0xf; - set_pte_at(mm, addr, ptep, null); + set_pte(ptep, null); } /* @@ -273,7 +280,10 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void __init paging_init(void); extern void __init mmu_init(void); -extern void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *pte); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); + +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #endif /* _ASM_NIOS2_PGTABLE_H */ diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 6aa9257c3ede..28b805f465a8 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -71,26 +71,26 @@ static void __flush_icache(unsigned long start, unsigned long end) __asm__ __volatile(" flushp\n"); } -static void flush_aliases(struct address_space *mapping, struct page *page) +static void flush_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; - struct vm_area_struct *mpnt; + struct vm_area_struct *vma; pgoff_t pgoff; + unsigned long nr = folio_nr_pages(folio); - pgoff = page->index; + pgoff = folio->index; flush_dcache_mmap_lock(mapping); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - unsigned long offset; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { + unsigned long start; - if (mpnt->vm_mm != mm) + if (vma->vm_mm != mm) continue; - if (!(mpnt->vm_flags & VM_MAYSHARE)) + if (!(vma->vm_flags & VM_MAYSHARE)) continue; - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - flush_cache_page(mpnt, mpnt->vm_start + offset, - page_to_pfn(page)); + start = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + flush_cache_range(vma, start, start + nr * PAGE_SIZE); } flush_dcache_mmap_unlock(mapping); } @@ -138,10 +138,11 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache(start, end); } -void flush_icache_page(struct vm_area_struct *vma, struct page *page) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) { unsigned long start = (unsigned long) page_address(page); - unsigned long end = start + PAGE_SIZE; + unsigned long end = start + nr * PAGE_SIZE; __flush_dcache(start, end); __flush_icache(start, end); @@ -158,19 +159,19 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, __flush_icache(start, end); } -void __flush_dcache_page(struct address_space *mapping, struct page *page) +static void __flush_dcache_folio(struct folio *folio) { /* * Writeback any data associated with the kernel mapping of this * page. This ensures that data in the physical page is mutually * coherent with the kernels mapping. */ - unsigned long start = (unsigned long)page_address(page); + unsigned long start = (unsigned long)folio_address(folio); - __flush_dcache(start, start + PAGE_SIZE); + __flush_dcache(start, start + folio_size(folio)); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; @@ -178,32 +179,38 @@ void flush_dcache_page(struct page *page) * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } else { - __flush_dcache_page(mapping, page); + __flush_dcache_folio(folio); if (mapping) { - unsigned long start = (unsigned long)page_address(page); - flush_aliases(mapping, page); - flush_icache_range(start, start + PAGE_SIZE); + unsigned long start = (unsigned long)folio_address(folio); + flush_aliases(mapping, folio); + flush_icache_range(start, start + folio_size(folio)); } - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { pte_t pte = *ptep; unsigned long pfn = pte_pfn(pte); - struct page *page; + struct folio *folio; struct address_space *mapping; reload_tlb_page(vma, address, pte); @@ -215,19 +222,19 @@ void update_mmu_cache(struct vm_area_struct *vma, * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + folio = page_folio(pfn_to_page(pfn)); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(folio); - if(mapping) - { - flush_aliases(mapping, page); + mapping = folio_flush_mapping(folio); + if (mapping) { + flush_aliases(mapping, folio); if (vma->vm_flags & VM_EXEC) - flush_icache_page(vma, page); + flush_icache_pages(vma, &folio->page, + folio_nr_pages(folio)); } } -- cgit v1.2.3 From 063e409dcc37a5834fe94342b3cbcfe17d094eed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:47 +0100 Subject: openrisc: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/cacheflush.h | 8 +++++++- arch/openrisc/include/asm/pgtable.h | 15 ++++++++++----- arch/openrisc/mm/cache.c | 12 ++++++++---- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index eeac40d4a854..984c331ff5f4 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -56,10 +56,16 @@ static inline void sync_icache_dcache(struct page *page) */ #define PG_dc_clean PG_arch_1 +static inline void flush_dcache_folio(struct folio *folio) +{ + clear_bit(PG_dc_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { - clear_bit(PG_dc_clean, &page->flags); + flush_dcache_folio(page_folio(page)); } #define flush_icache_user_page(vma, page, addr, len) \ diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 3eb9b9555d0d..7bdf1bb0d177 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -46,7 +46,7 @@ extern void paging_init(void); * hook is made available. */ #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) -#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) + /* * (pmds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) @@ -357,6 +357,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __pmd_offset(address) \ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_pfn(x) ((unsigned long)(((x).pte)) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte((((pfn) << PAGE_SHIFT)) | pgprot_val(prot)) @@ -379,13 +380,17 @@ static inline void update_tlb(struct vm_area_struct *vma, extern void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *pte) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { - update_tlb(vma, address, pte); - update_cache(vma, address, pte); + update_tlb(vma, address, ptep); + update_cache(vma, address, ptep); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + /* __PHX__ FIXME, SWAP, this probably doesn't work */ /* diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index 534a52ec5e66..eb43b73f3855 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -43,15 +43,19 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - int dirty = !test_and_set_bit(PG_dc_clean, &page->flags); + struct folio *folio = page_folio(pfn_to_page(pfn)); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); /* * Since icaches do not snoop for updated data on OpenRISC, we * must write back and invalidate any dirty pages manually. We * can skip data pages, since they will not end up in icaches. */ - if ((vma->vm_flags & VM_EXEC) && dirty) - sync_icache_dcache(page); + if ((vma->vm_flags & VM_EXEC) && dirty) { + unsigned int nr = folio_nr_pages(folio); + + while (nr--) + sync_icache_dcache(folio_page(folio, nr)); + } } -- cgit v1.2.3 From e70bbca607424dbb236cc641adba39c2cc0d65c5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:48 +0100 Subject: parisc: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "James E.J. Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton --- arch/parisc/include/asm/cacheflush.h | 14 +++-- arch/parisc/include/asm/pgtable.h | 37 +++++++----- arch/parisc/kernel/cache.c | 107 ++++++++++++++++++++++++----------- 3 files changed, 105 insertions(+), 53 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index c8b6928cee1e..b77c3e0c37d3 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -43,8 +43,13 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *page); +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) @@ -53,10 +58,9 @@ void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock_irqrestore(mapping, flags) \ xa_unlock_irqrestore(&mapping->i_pages, flags) -#define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page_addr(page_address(page)); \ - flush_kernel_icache_page(page_address(page)); \ -} while (0) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) #define flush_icache_range(s,e) do { \ flush_kernel_dcache_range_asm(s,e); \ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 5656395c95ee..ce38bb375b60 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -73,15 +73,6 @@ extern void __update_cache(pte_t pte); mb(); \ } while(0) -#define set_pte_at(mm, addr, pteptr, pteval) \ - do { \ - if (pte_present(pteval) && \ - pte_user(pteval)) \ - __update_cache(pteval); \ - *(pteptr) = (pteval); \ - purge_tlb_entries(mm, addr); \ - } while (0) - #endif /* !__ASSEMBLY__ */ #define pte_ERROR(e) \ @@ -285,7 +276,7 @@ extern unsigned long *empty_zero_page; #define pte_none(x) (pte_val(x) == 0) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_user(x) (pte_val(x) & _PAGE_USER) -#define pte_clear(mm, addr, xp) set_pte_at(mm, addr, xp, __pte(0)) +#define pte_clear(mm, addr, xp) set_pte(xp, __pte(0)) #define pmd_flag(x) (pmd_val(x) & PxD_FLAG_MASK) #define pmd_address(x) ((unsigned long)(pmd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) @@ -391,11 +382,29 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) extern void paging_init (void); +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + if (pte_present(pte) && pte_user(pte)) + __update_cache(pte); + for (;;) { + *ptep = pte; + purge_tlb_entries(mm, addr); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += 1 << PFN_PTE_SHIFT; + addr += PAGE_SIZE; + } +} +#define set_ptes set_ptes + /* Used for deferring calls to flush_dcache_page() */ #define PG_dcache_dirty PG_arch_1 -#define update_mmu_cache(vms,addr,ptep) __update_cache(*ptep) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) __update_cache(*ptep) +#define update_mmu_cache(vma, addr, ptep) __update_cache(*ptep) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that @@ -450,7 +459,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned if (!pte_young(pte)) { return 0; } - set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte)); + set_pte(ptep, pte_mkold(pte)); return 1; } @@ -460,14 +469,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t old_pte; old_pte = *ptep; - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); return old_pte; } static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep)); + set_pte(ptep, pte_wrprotect(*ptep)); } #define pte_same(A,B) (pte_val(A) == pte_val(B)) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index b55b35c89d6a..442109a48940 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -94,11 +94,11 @@ static inline void flush_data_cache(void) /* Kernel virtual address of pfn. */ #define pfn_va(pfn) __va(PFN_PHYS(pfn)) -void -__update_cache(pte_t pte) +void __update_cache(pte_t pte) { unsigned long pfn = pte_pfn(pte); - struct page *page; + struct folio *folio; + unsigned int nr; /* We don't have pte special. As a result, we can be called with an invalid pfn and we don't need to flush the kernel dcache page. @@ -106,13 +106,17 @@ __update_cache(pte_t pte) if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); - if (page_mapping_file(page) && - test_bit(PG_dcache_dirty, &page->flags)) { - flush_kernel_dcache_page_addr(pfn_va(pfn)); - clear_bit(PG_dcache_dirty, &page->flags); + folio = page_folio(pfn_to_page(pfn)); + pfn = folio_pfn(folio); + nr = folio_nr_pages(folio); + if (folio_flush_mapping(folio) && + test_bit(PG_dcache_dirty, &folio->flags)) { + while (nr--) + flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); + clear_bit(PG_dcache_dirty, &folio->flags); } else if (parisc_requires_coherency()) - flush_kernel_dcache_page_addr(pfn_va(pfn)); + while (nr--) + flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); } void @@ -366,6 +370,20 @@ static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmad preempt_enable(); } +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) +{ + void *kaddr = page_address(page); + + for (;;) { + flush_kernel_dcache_page_addr(kaddr); + flush_kernel_icache_page(kaddr); + if (--nr == 0) + break; + kaddr += PAGE_SIZE; + } +} + static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) { pte_t *ptep = NULL; @@ -394,27 +412,30 @@ static inline bool pte_needs_flush(pte_t pte) == (_PAGE_PRESENT | _PAGE_ACCESSED); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - struct address_space *mapping = page_mapping_file(page); - struct vm_area_struct *mpnt; - unsigned long offset; + struct address_space *mapping = folio_flush_mapping(folio); + struct vm_area_struct *vma; unsigned long addr, old_addr = 0; + void *kaddr; unsigned long count = 0; - unsigned long flags; + unsigned long i, nr, flags; pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_dcache_dirty, &page->flags); + set_bit(PG_dcache_dirty, &folio->flags); return; } - flush_kernel_dcache_page_addr(page_address(page)); + nr = folio_nr_pages(folio); + kaddr = folio_address(folio); + for (i = 0; i < nr; i++) + flush_kernel_dcache_page_addr(kaddr + i * PAGE_SIZE); if (!mapping) return; - pgoff = page->index; + pgoff = folio->index; /* * We have carefully arranged in arch_get_unmapped_area() that @@ -424,20 +445,33 @@ void flush_dcache_page(struct page *page) * on machines that support equivalent aliasing */ flush_dcache_mmap_lock_irqsave(mapping, flags); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - addr = mpnt->vm_start + offset; - if (parisc_requires_coherency()) { - bool needs_flush = false; - pte_t *ptep; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { + unsigned long offset = pgoff - vma->vm_pgoff; + unsigned long pfn = folio_pfn(folio); + + addr = vma->vm_start; + nr = folio_nr_pages(folio); + if (offset > -nr) { + pfn -= offset; + nr += offset; + } else { + addr += offset * PAGE_SIZE; + } + if (addr + nr * PAGE_SIZE > vma->vm_end) + nr = (vma->vm_end - addr) / PAGE_SIZE; - ptep = get_ptep(mpnt->vm_mm, addr); - if (ptep) { - needs_flush = pte_needs_flush(*ptep); + if (parisc_requires_coherency()) { + for (i = 0; i < nr; i++) { + pte_t *ptep = get_ptep(vma->vm_mm, + addr + i * PAGE_SIZE); + if (!ptep) + continue; + if (pte_needs_flush(*ptep)) + flush_user_cache_page(vma, + addr + i * PAGE_SIZE); + /* Optimise accesses to the same table? */ pte_unmap(ptep); } - if (needs_flush) - flush_user_cache_page(mpnt, addr); } else { /* * The TLB is the engine of coherence on parisc: @@ -450,27 +484,32 @@ void flush_dcache_page(struct page *page) * in (until the user or kernel specifically * accesses it, of course) */ - flush_tlb_page(mpnt, addr); + for (i = 0; i < nr; i++) + flush_tlb_page(vma, addr + i * PAGE_SIZE); if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) != (addr & (SHM_COLOUR - 1))) { - __flush_cache_page(mpnt, addr, page_to_phys(page)); + for (i = 0; i < nr; i++) + __flush_cache_page(vma, + addr + i * PAGE_SIZE, + (pfn + i) * PAGE_SIZE); /* * Software is allowed to have any number * of private mappings to a page. */ - if (!(mpnt->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_SHARED)) continue; if (old_addr) pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", - old_addr, addr, mpnt->vm_file); - old_addr = addr; + old_addr, addr, vma->vm_file); + if (nr == folio_nr_pages(folio)) + old_addr = addr; } } WARN_ON(++count == 4096); } flush_dcache_mmap_unlock_irqrestore(mapping, flags); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); -- cgit v1.2.3 From 9fee28baa601f4dbf869b1373183b312d2d5ef3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:49 +0100 Subject: powerpc: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. [willy@infradead.org: re-export flush_dcache_icache_folio()] Link: https://lkml.kernel.org/r/ZMx1daYwvD9EM7Cv@casper.infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 5 --- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +--- arch/powerpc/include/asm/book3s/pgtable.h | 11 ++---- arch/powerpc/include/asm/cacheflush.h | 14 +++++--- arch/powerpc/include/asm/kvm_ppc.h | 10 +++--- arch/powerpc/include/asm/nohash/pgtable.h | 16 +++------ arch/powerpc/include/asm/pgtable.h | 12 +++++++ arch/powerpc/mm/book3s64/hash_utils.c | 11 +++--- arch/powerpc/mm/cacheflush.c | 41 ++++++++------------- arch/powerpc/mm/nohash/e500_hugetlbpage.c | 3 +- arch/powerpc/mm/pgtable.c | 53 ++++++++++++++++------------ 11 files changed, 89 insertions(+), 93 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7bf1fe7297c6..5f12b9382909 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -462,11 +462,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return pte_val(pte) >> PTE_RPN_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index a8204566cfd0..8269b231c533 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -104,6 +104,7 @@ * and every thing below PAGE_SHIFT; */ #define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK)) +#define PTE_RPN_SHIFT PAGE_SHIFT /* * set of bits not changed in pmd_modify. Even though we have hash specific bits * in here, on radix we expect them to be zero. @@ -569,11 +570,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index d18b748ea3ae..3b7bd36a2321 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -9,13 +9,6 @@ #endif #ifndef __ASSEMBLY__ -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - - #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -36,7 +29,9 @@ void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * corresponding HPTE into the hash table ahead of time, instead of * waiting for the inevitable extra hash-table miss exception. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { if (IS_ENABLED(CONFIG_PPC32) && !mmu_has_feature(MMU_FTR_HPTE_TABLE)) return; diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 7564dd4fd12b..ef7d2de33b89 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -35,13 +35,19 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) * It just marks the page as not i-cache clean. We do the i-cache * flush later when the page is given to a user process, if necessary. */ -static inline void flush_dcache_page(struct page *page) +static inline void flush_dcache_folio(struct folio *folio) { if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } void flush_icache_range(unsigned long start, unsigned long stop); @@ -51,7 +57,7 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); #define flush_icache_user_page flush_icache_user_page -void flush_dcache_icache_page(struct page *page); +void flush_dcache_icache_folio(struct folio *folio); /** * flush_dcache_range(): Write any modified data cache blocks out to memory and diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d16d80ad2ae4..b4da8514af43 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -894,7 +894,7 @@ void kvmppc_init_lpid(unsigned long nr_lpids); static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) { - struct page *page; + struct folio *folio; /* * We can only access pages that the kernel maps * as memory. Bail out for unmapped ones. @@ -903,10 +903,10 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) return; /* Clear i-cache for new pages */ - page = pfn_to_page(pfn); - if (!test_bit(PG_dcache_clean, &page->flags)) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + folio = page_folio(pfn_to_page(pfn)); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index a6caaaab6f92..56ea48276356 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,8 +101,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) | pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) { - return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ static inline pte_t pte_exprotect(pte_t pte) @@ -166,12 +164,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); } -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - /* This low level function performs the actual PTE insertion * Setting the PTE depends on the MMU type and other factors. It's * an horrible mess that I'm not going to try to clean up now but @@ -282,10 +274,12 @@ static inline int pud_huge(pud_t pud) * for the page which has just been mapped in. */ #if defined(CONFIG_PPC_E500) && defined(CONFIG_HUGETLB_PAGE) -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); #else -static inline -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) {} #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 33464e6d6431..b2e9bc4a52c1 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,12 @@ struct mm_struct; #ifndef __ASSEMBLY__ +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr); +#define set_ptes set_ptes +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #ifndef MAX_PTRS_PER_PGD #define MAX_PTRS_PER_PGD PTRS_PER_PGD #endif @@ -48,6 +54,12 @@ struct mm_struct; /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; +} + /* * Select all bits except the pfn */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index fedffe3ae136..ad2afa08e62e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1307,18 +1307,19 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { + if (!test_bit(PG_dcache_clean, &folio->flags) && + !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 0e9b4879c0f9..15189592da09 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p) invalidate_icache_range(addr, addr + PAGE_SIZE); } -static void flush_dcache_icache_hugepage(struct page *page) +void flush_dcache_icache_folio(struct folio *folio) { - int i; - int nr = compound_nr(page); + unsigned int i, nr = folio_nr_pages(folio); - if (!PageHighMem(page)) { + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); for (i = 0; i < nr; i++) - __flush_dcache_icache(lowmem_page_address(page + i)); - } else { + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { for (i = 0; i < nr; i++) { - void *start = kmap_local_page(page + i); + void *start = kmap_local_folio(folio, i * PAGE_SIZE); __flush_dcache_icache(start); kunmap_local(start); } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - if (flush_coherent_icache()) - return; - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - - if (!PageHighMem(page)) { - __flush_dcache_icache(lowmem_page_address(page)); - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_local_page(page); - - __flush_dcache_icache(start); - kunmap_local(start); } else { - flush_dcache_icache_phys(page_to_phys(page)); + unsigned long pfn = folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); } } -EXPORT_SYMBOL(flush_dcache_icache_page); +EXPORT_SYMBOL(flush_dcache_icache_folio); void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 58c8d9849cb1..6b30e40d4590 100644 --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -178,7 +178,8 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a3dcdb2d5b4b..3f86fd217690 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -58,7 +58,7 @@ static inline int pte_looks_normal(pte_t pte) return 0; } -static struct page *maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } #ifdef CONFIG_PPC_BOOK3S @@ -84,12 +84,12 @@ static pte_t set_pte_filter_hash(pte_t pte) pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &pg->flags)) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } return pte; @@ -107,7 +107,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; } */ static inline pte_t set_pte_filter(pte_t pte) { - struct page *pg; + struct folio *folio; if (radix_enabled()) return pte; @@ -120,18 +120,18 @@ static inline pte_t set_pte_filter(pte_t pte) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); return pte; } @@ -142,7 +142,7 @@ static inline pte_t set_pte_filter(pte_t pte) static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) return pte; @@ -168,17 +168,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) goto bail; /* Clean the page and set PG_dcache_clean */ - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); bail: return pte_mkexec(pte); @@ -187,8 +187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { /* * Make sure hardware valid bit is not set. We don't do @@ -203,7 +203,16 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte = set_pte_filter(pte); /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + arch_enter_lazy_mmu_mode(); + for (;;) { + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT)); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); } void unmap_kernel_page(unsigned long va) -- cgit v1.2.3 From 864609c6a0b5f0464f6ec7869cb2a45a529c35d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:50 +0100 Subject: riscv: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Alexandre Ghiti Acked-by: Mike Rapoport (IBM) Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Signed-off-by: Andrew Morton --- arch/riscv/include/asm/cacheflush.h | 19 +++++++++---------- arch/riscv/include/asm/pgtable.h | 37 ++++++++++++++++++++++++------------- arch/riscv/mm/cacheflush.c | 13 +++---------- 3 files changed, 36 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8091b8bf4883..0d8c92c5dfb7 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -15,20 +15,19 @@ static inline void local_flush_icache_all(void) #define PG_dcache_clean PG_arch_1 -static inline void flush_dcache_page(struct page *page) +static inline void flush_dcache_folio(struct folio *folio) { - /* - * HugeTLB pages are always fully mapped and only head page will be - * set PG_dcache_clean (see comments in flush_icache_pte()). - */ - if (PageHuge(page)) - page = compound_head(page); - - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); } +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} + /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, * so instead we just flush the whole thing. diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 01e4aabc8898..ac42e9121e52 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -445,8 +445,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* Commit new configuration to MMU hardware */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { /* * The kernel assumes that TLBs don't cache invalid entries, but @@ -455,8 +456,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * Relying on flush_tlb_fix_spurious_fault would suffice, but * the extra traps reduce performance. So, eagerly SFENCE.VMA. */ - local_flush_tlb_page(address); + while (nr--) + local_flush_tlb_page(address + nr * PAGE_SIZE); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache @@ -487,8 +491,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) void flush_icache_pte(pte_t pte); -static inline void __set_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pteval) +static inline void __set_pte_at(pte_t *ptep, pte_t pteval) { if (pte_present(pteval) && pte_exec(pteval)) flush_icache_pte(pteval); @@ -496,17 +499,25 @@ static inline void __set_pte_at(struct mm_struct *mm, set_pte(ptep, pteval); } -static inline void set_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pteval, 1); - __set_pte_at(mm, addr, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, nr); + + for (;;) { + __set_pte_at(ptep, pteval); + if (--nr == 0) + break; + ptep++; + pte_val(pteval) += 1 << _PAGE_PFN_SHIFT; + } } +#define set_ptes set_ptes static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - __set_pte_at(mm, addr, ptep, __pte(0)); + __set_pte_at(ptep, __pte(0)); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -515,7 +526,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { if (!pte_same(*ptep, entry)) - set_pte_at(vma->vm_mm, address, ptep, entry); + __set_pte_at(ptep, entry); /* * update_mmu_cache will unconditionally execute, handling both * the case that the PTE changed and the spurious fault case. @@ -688,14 +699,14 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(mm, pmdp, pmd); - return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); + return __set_pte_at((pte_t *)pmdp, pmd_pte(pmd)); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { page_table_check_pud_set(mm, pudp, pud); - return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); + return __set_pte_at((pte_t *)pudp, pud_pte(pud)); } #ifdef CONFIG_PAGE_TABLE_CHECK diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index fbc59b3f69f2..f1387272a551 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -82,18 +82,11 @@ void flush_icache_mm(struct mm_struct *mm, bool local) #ifdef CONFIG_MMU void flush_icache_pte(pte_t pte) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - /* - * HugeTLB pages are always fully mapped, so only setting head page's - * PG_dcache_clean flag is enough. - */ - if (PageHuge(page)) - page = compound_head(page); - - if (!test_bit(PG_dcache_clean, &page->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags)) { flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 843f9310e00ad4d6207cff88c05ce90b857625d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:51 +0100 Subject: s390: implement the new page table range API Add set_ptes() and update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Gerald Schaefer Acked-by: Mike Rapoport (IBM) Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c55f3c3365af..02973c740a5b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -47,6 +47,7 @@ static inline void update_page_count(int level, long count) * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -1316,20 +1317,34 @@ pgprot_t pgprot_writecombine(pgprot_t prot); pgprot_t pgprot_writethrough(pgprot_t prot); /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - set_pte(ptep, entry); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, -- cgit v1.2.3 From 157efa290441bf7eb952f81717704afef09ae0d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:52 +0100 Subject: sh: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_dcache_clean flag from being per-page to per-folio. Flush the entire folio containing the pages in flush_icache_pages() for ease of implementation. Link: https://lkml.kernel.org/r/20230802151406.3735276-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton --- arch/sh/include/asm/cacheflush.h | 21 ++++++++++------ arch/sh/include/asm/pgtable.h | 7 ++++-- arch/sh/include/asm/pgtable_32.h | 5 ++-- arch/sh/mm/cache-j2.c | 4 ++-- arch/sh/mm/cache-sh4.c | 26 +++++++++++++------- arch/sh/mm/cache-sh7705.c | 26 ++++++++++++-------- arch/sh/mm/cache.c | 52 +++++++++++++++++++++++----------------- arch/sh/mm/kmap.c | 3 ++- 8 files changed, 89 insertions(+), 55 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 481a664287e2..9fceef6f3e00 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -13,9 +13,9 @@ * - flush_cache_page(mm, vmaddr, pfn) flushes a single page * - flush_cache_range(vma, start, end) flushes a range of pages * - * - flush_dcache_page(pg) flushes(wback&invalidates) a page for dcache + * - flush_dcache_folio(folio) flushes(wback&invalidates) a folio for dcache * - flush_icache_range(start, end) flushes(invalidates) a range for icache - * - flush_icache_page(vma, pg) flushes(invalidates) a page for icache + * - flush_icache_pages(vma, pg, nr) flushes(invalidates) pages for icache * - flush_cache_sigtramp(vaddr) flushes the signal trampoline */ extern void (*local_flush_cache_all)(void *args); @@ -23,9 +23,9 @@ extern void (*local_flush_cache_mm)(void *args); extern void (*local_flush_cache_dup_mm)(void *args); extern void (*local_flush_cache_page)(void *args); extern void (*local_flush_cache_range)(void *args); -extern void (*local_flush_dcache_page)(void *args); +extern void (*local_flush_dcache_folio)(void *args); extern void (*local_flush_icache_range)(void *args); -extern void (*local_flush_icache_page)(void *args); +extern void (*local_flush_icache_folio)(void *args); extern void (*local_flush_cache_sigtramp)(void *args); static inline void cache_noop(void *args) { } @@ -42,11 +42,18 @@ extern void flush_cache_page(struct vm_area_struct *vma, extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} + extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range -extern void flush_icache_page(struct vm_area_struct *vma, - struct page *page); +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_cache_sigtramp(unsigned long address); struct flusher_data { diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 3ce30becf6df..729f5c6225fb 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -102,13 +102,16 @@ extern void __update_cache(struct vm_area_struct *vma, extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); -static inline void -update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { pte_t pte = *ptep; __update_cache(vma, address, pte); __update_tlb(vma, address, pte); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init(void); diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 21952b094650..676f3d4ef6ce 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -307,14 +307,13 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #endif -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* * (pmds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) */ #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pfn_pte(pfn, prot) \ __pte(((unsigned long long)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) \ @@ -323,7 +322,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define pte_none(x) (!pte_val(x)) #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) +#define pte_clear(mm, addr, ptep) set_pte(ptep, __pte(0)) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x)) diff --git a/arch/sh/mm/cache-j2.c b/arch/sh/mm/cache-j2.c index f277862a11f5..9ac960214380 100644 --- a/arch/sh/mm/cache-j2.c +++ b/arch/sh/mm/cache-j2.c @@ -55,9 +55,9 @@ void __init j2_cache_init(void) local_flush_cache_dup_mm = j2_flush_both; local_flush_cache_page = j2_flush_both; local_flush_cache_range = j2_flush_both; - local_flush_dcache_page = j2_flush_dcache; + local_flush_dcache_folio = j2_flush_dcache; local_flush_icache_range = j2_flush_icache; - local_flush_icache_page = j2_flush_icache; + local_flush_icache_folio = j2_flush_icache; local_flush_cache_sigtramp = j2_flush_icache; pr_info("Initial J2 CCR is %.8x\n", __raw_readl(j2_ccr_base)); diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 72c2e1b46c08..862046f26981 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -107,19 +107,29 @@ static inline void flush_cache_one(unsigned long start, unsigned long phys) * Write back & invalidate the D-cache of the page. * (To avoid "alias" issues) */ -static void sh4_flush_dcache_page(void *arg) +static void sh4_flush_dcache_folio(void *arg) { - struct page *page = arg; - unsigned long addr = (unsigned long)page_address(page); + struct folio *folio = arg; #ifndef CONFIG_SMP - struct address_space *mapping = page_mapping_file(page); + struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); else #endif - flush_cache_one(CACHE_OC_ADDRESS_ARRAY | - (addr & shm_align_mask), page_to_phys(page)); + { + unsigned long pfn = folio_pfn(folio); + unsigned long addr = (unsigned long)folio_address(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + flush_cache_one(CACHE_OC_ADDRESS_ARRAY | + (addr & shm_align_mask), + pfn * PAGE_SIZE); + addr += PAGE_SIZE; + pfn++; + } + } wmb(); } @@ -379,7 +389,7 @@ void __init sh4_cache_init(void) __raw_readl(CCN_PRR)); local_flush_icache_range = sh4_flush_icache_range; - local_flush_dcache_page = sh4_flush_dcache_page; + local_flush_dcache_folio = sh4_flush_dcache_folio; local_flush_cache_all = sh4_flush_cache_all; local_flush_cache_mm = sh4_flush_cache_mm; local_flush_cache_dup_mm = sh4_flush_cache_mm; diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 9b63a53a5e46..b509a407588f 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -132,15 +132,20 @@ static void __flush_dcache_page(unsigned long phys) * Write back & invalidate the D-cache of the page. * (To avoid "alias" issues) */ -static void sh7705_flush_dcache_page(void *arg) +static void sh7705_flush_dcache_folio(void *arg) { - struct page *page = arg; - struct address_space *mapping = page_mapping_file(page); + struct folio *folio = arg; + struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); - else - __flush_dcache_page(__pa(page_address(page))); + clear_bit(PG_dcache_clean, &folio->flags); + else { + unsigned long pfn = folio_pfn(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + __flush_dcache_page((pfn + i) * PAGE_SIZE); + } } static void sh7705_flush_cache_all(void *args) @@ -176,19 +181,20 @@ static void sh7705_flush_cache_page(void *args) * Not entirely sure why this is necessary on SH3 with 32K cache but * without it we get occasional "Memory fault" when loading a program. */ -static void sh7705_flush_icache_page(void *page) +static void sh7705_flush_icache_folio(void *arg) { - __flush_purge_region(page_address(page), PAGE_SIZE); + struct folio *folio = arg; + __flush_purge_region(folio_address(folio), folio_size(folio)); } void __init sh7705_cache_init(void) { local_flush_icache_range = sh7705_flush_icache_range; - local_flush_dcache_page = sh7705_flush_dcache_page; + local_flush_dcache_folio = sh7705_flush_dcache_folio; local_flush_cache_all = sh7705_flush_cache_all; local_flush_cache_mm = sh7705_flush_cache_all; local_flush_cache_dup_mm = sh7705_flush_cache_all; local_flush_cache_range = sh7705_flush_cache_all; local_flush_cache_page = sh7705_flush_cache_page; - local_flush_icache_page = sh7705_flush_icache_page; + local_flush_icache_folio = sh7705_flush_icache_folio; } diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 3aef78ceb820..9bcaa5619eab 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -20,9 +20,9 @@ void (*local_flush_cache_mm)(void *args) = cache_noop; void (*local_flush_cache_dup_mm)(void *args) = cache_noop; void (*local_flush_cache_page)(void *args) = cache_noop; void (*local_flush_cache_range)(void *args) = cache_noop; -void (*local_flush_dcache_page)(void *args) = cache_noop; +void (*local_flush_dcache_folio)(void *args) = cache_noop; void (*local_flush_icache_range)(void *args) = cache_noop; -void (*local_flush_icache_page)(void *args) = cache_noop; +void (*local_flush_icache_folio)(void *args) = cache_noop; void (*local_flush_cache_sigtramp)(void *args) = cache_noop; void (*__flush_wback_region)(void *start, int size); @@ -61,15 +61,17 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { - if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + struct folio *folio = page_folio(page); + + if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && + test_bit(PG_dcache_clean, &folio->flags)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(vto); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } if (vma->vm_flags & VM_EXEC) @@ -80,27 +82,30 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + test_bit(PG_dcache_clean, &folio->flags)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(vfrom); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } } void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *vfrom, *vto; vto = kmap_atomic(to); - if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) && - test_bit(PG_dcache_clean, &from->flags)) { + if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) && + test_bit(PG_dcache_clean, &src->flags)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(vfrom); @@ -136,27 +141,28 @@ EXPORT_SYMBOL(clear_user_highpage); void __update_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) { - struct page *page; unsigned long pfn = pte_pfn(pte); if (!boot_cpu_data.dcache.n_aliases) return; - page = pfn_to_page(pfn); if (pfn_valid(pfn)) { - int dirty = !test_and_set_bit(PG_dcache_clean, &page->flags); + struct folio *folio = page_folio(pfn_to_page(pfn)); + int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags); if (dirty) - __flush_purge_region(page_address(page), PAGE_SIZE); + __flush_purge_region(folio_address(folio), + folio_size(folio)); } } void __flush_anon_page(struct page *page, unsigned long vmaddr) { + struct folio *folio = page_folio(page); unsigned long addr = (unsigned long) page_address(page); if (pages_do_alias(addr, vmaddr)) { - if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && + test_bit(PG_dcache_clean, &folio->flags)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); @@ -164,7 +170,8 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) /* __flush_purge_region((void *)kaddr, PAGE_SIZE); */ kunmap_coherent(kaddr); } else - __flush_purge_region((void *)addr, PAGE_SIZE); + __flush_purge_region(folio_address(folio), + folio_size(folio)); } } @@ -215,11 +222,11 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, } EXPORT_SYMBOL(flush_cache_range); -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - cacheop_on_each_cpu(local_flush_dcache_page, page, 1); + cacheop_on_each_cpu(local_flush_dcache_folio, folio, 1); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); void flush_icache_range(unsigned long start, unsigned long end) { @@ -233,10 +240,11 @@ void flush_icache_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(flush_icache_range); -void flush_icache_page(struct vm_area_struct *vma, struct page *page) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) { - /* Nothing uses the VMA, so just pass the struct page along */ - cacheop_on_each_cpu(local_flush_icache_page, page, 1); + /* Nothing uses the VMA, so just pass the folio along */ + cacheop_on_each_cpu(local_flush_icache_folio, page_folio(page), 1); } void flush_cache_sigtramp(unsigned long address) diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 73fd7cc99430..fa50e8f6e7a9 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -27,10 +27,11 @@ void __init kmap_coherent_init(void) void *kmap_coherent(struct page *page, unsigned long addr) { + struct folio *folio = page_folio(page); enum fixed_addresses idx; unsigned long vaddr; - BUG_ON(!test_bit(PG_dcache_clean, &page->flags)); + BUG_ON(!test_bit(PG_dcache_clean, &folio->flags)); preempt_disable(); pagefault_disable(); -- cgit v1.2.3 From 665f640294540a941aabb81ae46dfc671aff5259 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:53 +0100 Subject: sparc32: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Link: https://lkml.kernel.org/r/20230802151406.3735276-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "David S. Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/cacheflush_32.h | 10 ++++++++-- arch/sparc/include/asm/pgtable_32.h | 8 ++++---- arch/sparc/mm/init_32.c | 13 +++++++++++-- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index adb6991d0455..c8dd971f0e88 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -2,6 +2,7 @@ #ifndef _SPARC_CACHEFLUSH_H #define _SPARC_CACHEFLUSH_H +#include #include #define flush_cache_all() \ @@ -16,6 +17,7 @@ sparc32_cachetlb_ops->cache_page(vma, addr) #define flush_icache_range(start, end) do { } while (0) #define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_pages(vma, pg, nr) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ @@ -35,11 +37,15 @@ #define flush_page_for_dma(addr) \ sparc32_cachetlb_ops->page_for_dma(addr) -struct page; void sparc_flush_page_to_ram(struct page *page); +void sparc_flush_folio_to_ram(struct folio *folio); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -#define flush_dcache_page(page) sparc_flush_page_to_ram(page) +#define flush_dcache_folio(folio) sparc_flush_folio_to_ram(folio) +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index d4330e3c57a6..315d316614ca 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -101,8 +101,6 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) srmmu_swap((unsigned long *)ptep, pte_val(pteval)); } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - static inline int srmmu_device_memory(unsigned long x) { return ((x & 0xF0000000) != 0); @@ -256,6 +254,7 @@ static inline pte_t pte_mkyoung(pte_t pte) return __pte(pte_val(pte) | SRMMU_REF); } +#define PFN_PTE_SHIFT (PAGE_SHIFT - 4) #define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot) static inline unsigned long pte_pfn(pte_t pte) @@ -268,7 +267,7 @@ static inline unsigned long pte_pfn(pte_t pte) */ return ~0UL; } - return (pte_val(pte) & SRMMU_PTE_PMASK) >> (PAGE_SHIFT-4); + return (pte_val(pte) & SRMMU_PTE_PMASK) >> PFN_PTE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -318,6 +317,7 @@ void mmu_info(struct seq_file *m); #define FAULT_CODE_USER 0x4 #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0) void srmmu_mapiorange(unsigned int bus, unsigned long xpa, unsigned long xva, unsigned int len); @@ -422,7 +422,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, ({ \ int __changed = !pte_same(*(__ptep), __entry); \ if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ + set_pte(__ptep, __entry); \ flush_tlb_page(__vma, __address); \ } \ __changed; \ diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 9c0ea457bdf0..d96a14ffceeb 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -297,11 +297,20 @@ void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); - if (vaddr) - __flush_page_to_ram(vaddr); + __flush_page_to_ram(vaddr); } EXPORT_SYMBOL(sparc_flush_page_to_ram); +void sparc_flush_folio_to_ram(struct folio *folio) +{ + unsigned long vaddr = (unsigned long)folio_address(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + __flush_page_to_ram(vaddr + i * PAGE_SIZE); +} +EXPORT_SYMBOL(sparc_flush_folio_to_ram); + static const pgprot_t protection_map[16] = { [VM_NONE] = PAGE_NONE, [VM_READ] = PAGE_READONLY, -- cgit v1.2.3 From 1a10a44dfc1d55ba84987da1f8377258a044499c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:54 +0100 Subject: sparc64: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Convert the PG_dcache_dirty flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "David S. Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/cacheflush_64.h | 18 +++++--- arch/sparc/include/asm/pgtable_64.h | 29 ++++++++++--- arch/sparc/kernel/smp_64.c | 56 +++++++++++++++--------- arch/sparc/mm/init_64.c | 78 ++++++++++++++++++++-------------- arch/sparc/mm/tlb.c | 5 ++- 5 files changed, 119 insertions(+), 67 deletions(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index b9341836597e..a9a719f04d06 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -35,20 +35,26 @@ void flush_icache_range(unsigned long start, unsigned long end); void __flush_icache_page(unsigned long); void __flush_dcache_page(void *addr, int flush_icache); -void flush_dcache_page_impl(struct page *page); +void flush_dcache_folio_impl(struct folio *folio); #ifdef CONFIG_SMP -void smp_flush_dcache_page_impl(struct page *page, int cpu); -void flush_dcache_page_all(struct mm_struct *mm, struct page *page); +void smp_flush_dcache_folio_impl(struct folio *folio, int cpu); +void flush_dcache_folio_all(struct mm_struct *mm, struct folio *folio); #else -#define smp_flush_dcache_page_impl(page,cpu) flush_dcache_page_impl(page) -#define flush_dcache_page_all(mm,page) flush_dcache_page_impl(page) +#define smp_flush_dcache_folio_impl(folio, cpu) flush_dcache_folio_impl(folio) +#define flush_dcache_folio_all(mm, folio) flush_dcache_folio_impl(folio) #endif void __flush_dcache_range(unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_icache_page(vma, pg) do { } while(0) +#define flush_icache_pages(vma, pg, nr) do { } while(0) void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 5563efa1a19f..09aa37cc4469 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -86,6 +86,7 @@ extern unsigned long VMALLOC_END; #define vmemmap ((struct page *)VMEMMAP_BASE) #include +#include bool kern_addr_valid(unsigned long addr); @@ -927,8 +928,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } -#define set_pte_at(mm,addr,ptep,pte) \ - __set_pte_at((mm), (addr), (ptep), (pte), 0) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + arch_enter_lazy_mmu_mode(); + for (;;) { + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += PAGE_SIZE; + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); +} +#define set_ptes set_ptes #define pte_clear(mm,addr,ptep) \ set_pte_at((mm), (addr), (ptep), __pte(0UL)) @@ -947,8 +961,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, \ if (pfn_valid(this_pfn) && \ (((old_addr) ^ (new_addr)) & (1 << 13))) \ - flush_dcache_page_all(current->mm, \ - pfn_to_page(this_pfn)); \ + flush_dcache_folio_all(current->mm, \ + page_folio(pfn_to_page(this_pfn))); \ } \ newpte; \ }) @@ -963,7 +977,10 @@ struct seq_file; void mmu_info(struct seq_file *); struct vm_area_struct; -void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); +void update_mmu_cache_range(struct vm_fault *, struct vm_area_struct *, + unsigned long addr, pte_t *ptep, unsigned int nr); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #ifdef CONFIG_TRANSPARENT_HUGEPAGE void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); @@ -1121,8 +1138,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) } #define pte_access_permitted pte_access_permitted -#include - /* We provide our own get_unmapped_area to cope with VA holes and * SHM area cache aliasing for userland. */ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e5964d1d8b37..f3969a3600db 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -921,20 +921,26 @@ extern unsigned long xcall_flush_dcache_page_cheetah; #endif extern unsigned long xcall_flush_dcache_page_spitfire; -static inline void __local_flush_dcache_page(struct page *page) +static inline void __local_flush_dcache_folio(struct folio *folio) { + unsigned int i, nr = folio_nr_pages(folio); + #ifdef DCACHE_ALIASING_POSSIBLE - __flush_dcache_page(page_address(page), + for (i = 0; i < nr; i++) + __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE, ((tlb_type == spitfire) && - page_mapping_file(page) != NULL)); + folio_flush_mapping(folio) != NULL)); #else - if (page_mapping_file(page) != NULL && - tlb_type == spitfire) - __flush_icache_page(__pa(page_address(page))); + if (folio_flush_mapping(folio) != NULL && + tlb_type == spitfire) { + unsigned long pfn = folio_pfn(folio) + for (i = 0; i < nr; i++) + __flush_icache_page((pfn + i) * PAGE_SIZE); + } #endif } -void smp_flush_dcache_page_impl(struct page *page, int cpu) +void smp_flush_dcache_folio_impl(struct folio *folio, int cpu) { int this_cpu; @@ -948,14 +954,14 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) this_cpu = get_cpu(); if (cpu == this_cpu) { - __local_flush_dcache_page(page); + __local_flush_dcache_folio(folio); } else if (cpu_online(cpu)) { - void *pg_addr = page_address(page); + void *pg_addr = folio_address(folio); u64 data0 = 0; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping_file(page) != NULL) + if (folio_flush_mapping(folio) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE @@ -963,18 +969,23 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) #endif } if (data0) { - xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, cpumask_of(cpu)); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpumask_of(cpu)); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + atomic_inc(&dcpage_flushes_xcall); #endif + pg_addr += PAGE_SIZE; + } } } put_cpu(); } -void flush_dcache_page_all(struct mm_struct *mm, struct page *page) +void flush_dcache_folio_all(struct mm_struct *mm, struct folio *folio) { void *pg_addr; u64 data0; @@ -988,10 +999,10 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) atomic_inc(&dcpage_flushes); #endif data0 = 0; - pg_addr = page_address(page); + pg_addr = folio_address(folio); if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping_file(page) != NULL) + if (folio_flush_mapping(folio) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE @@ -999,13 +1010,18 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) #endif } if (data0) { - xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, cpu_online_mask); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpu_online_mask); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + atomic_inc(&dcpage_flushes_xcall); #endif + pg_addr += PAGE_SIZE; + } } - __local_flush_dcache_page(page); + __local_flush_dcache_folio(folio); preempt_enable(); } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 9a63a3e08e40..f83017992eaa 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -195,21 +195,26 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); #endif #endif -inline void flush_dcache_page_impl(struct page *page) +inline void flush_dcache_folio_impl(struct folio *folio) { + unsigned int i, nr = folio_nr_pages(folio); + BUG_ON(tlb_type == hypervisor); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif #ifdef DCACHE_ALIASING_POSSIBLE - __flush_dcache_page(page_address(page), - ((tlb_type == spitfire) && - page_mapping_file(page) != NULL)); + for (i = 0; i < nr; i++) + __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE, + ((tlb_type == spitfire) && + folio_flush_mapping(folio) != NULL)); #else - if (page_mapping_file(page) != NULL && - tlb_type == spitfire) - __flush_icache_page(__pa(page_address(page))); + if (folio_flush_mapping(folio) != NULL && + tlb_type == spitfire) { + for (i = 0; i < nr; i++) + __flush_icache_page((pfn + i) * PAGE_SIZE); + } #endif } @@ -218,10 +223,10 @@ inline void flush_dcache_page_impl(struct page *page) #define PG_dcache_cpu_mask \ ((1UL<flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) +#define dcache_dirty_cpu(folio) \ + (((folio)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) -static inline void set_dcache_dirty(struct page *page, int this_cpu) +static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { unsigned long mask = this_cpu; unsigned long non_cpu_bits; @@ -238,11 +243,11 @@ static inline void set_dcache_dirty(struct page *page, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags) : "g1", "g7"); } -static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) +static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu) { unsigned long mask = (1UL << PG_dcache_dirty); @@ -260,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&page->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -284,9 +289,10 @@ static void flush_dcache(unsigned long pfn) page = pfn_to_page(pfn); if (page) { + struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = page->flags; + pg_flags = folio->flags; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -296,11 +302,11 @@ static void flush_dcache(unsigned long pfn) * in the SMP case. */ if (cpu == this_cpu) - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); else - smp_flush_dcache_page_impl(page, cpu); + smp_flush_dcache_folio_impl(folio, cpu); - clear_dcache_dirty_cpu(page, cpu); + clear_dcache_dirty_cpu(folio, cpu); put_cpu(); } @@ -388,12 +394,14 @@ bool __init arch_hugetlb_valid_size(unsigned long size) } #endif /* CONFIG_HUGETLB_PAGE */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { struct mm_struct *mm; unsigned long flags; bool is_huge_tsb; pte_t pte = *ptep; + unsigned int i; if (tlb_type != hypervisor) { unsigned long pfn = pte_pfn(pte); @@ -440,15 +448,21 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * } } #endif - if (!is_huge_tsb) - __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, - address, pte_val(pte)); + if (!is_huge_tsb) { + for (i = 0; i < nr; i++) { + __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, + address, pte_val(pte)); + address += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } + } spin_unlock_irqrestore(&mm->context.lock, flags); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { + unsigned long pfn = folio_pfn(folio); struct address_space *mapping; int this_cpu; @@ -459,35 +473,35 @@ void flush_dcache_page(struct page *page) * is merely the zero page. The 'bigcore' testcase in GDB * causes this case to run millions of times. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; this_cpu = get_cpu(); - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - int dirty = test_bit(PG_dcache_dirty, &page->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags); if (dirty) { - int dirty_cpu = dcache_dirty_cpu(page); + int dirty_cpu = dcache_dirty_cpu(folio); if (dirty_cpu == this_cpu) goto out; - smp_flush_dcache_page_impl(page, dirty_cpu); + smp_flush_dcache_folio_impl(folio, dirty_cpu); } - set_dcache_dirty(page, this_cpu); + set_dcache_dirty(folio, this_cpu); } else { /* We could delay the flush for the !page_mapping * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. */ - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); } out: put_cpu(); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); void __kprobes flush_icache_range(unsigned long start, unsigned long end) { @@ -2280,10 +2294,10 @@ void __init paging_init(void) setup_page_offset(); /* These build time checkes make sure that the dcache_dirty_cpu() - * page->flags usage will work. + * folio->flags usage will work. * * When a page gets marked as dcache-dirty, we store the - * cpu number starting at bit 32 in the page->flags. Also, + * cpu number starting at bit 32 in the folio->flags. Also, * functions like clear_dcache_dirty_cpu use the cpu mask * in 13-bit signed-immediate instruction fields. */ diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7ecf8556947a..0d41c94ec3ac 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -118,6 +118,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, unsigned long paddr, pfn = pte_pfn(orig); struct address_space *mapping; struct page *page; + struct folio *folio; if (!pfn_valid(pfn)) goto no_cache_flush; @@ -127,13 +128,13 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, goto no_cache_flush; /* A real file page? */ - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!mapping) goto no_cache_flush; paddr = (unsigned long) page_address(page); if ((paddr ^ vaddr) & (1 << 13)) - flush_dcache_page_all(mm, page); + flush_dcache_folio_all(mm, folio); } no_cache_flush: -- cgit v1.2.3 From fd8132e6e9fdecb9ff7d1db98014d372e03f3c9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:55 +0100 Subject: um: implement the new page table range API Add PFN_PTE_SHIFT and update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Signed-off-by: Andrew Morton --- arch/um/include/asm/pgtable.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index a70d1618eb35..44f6c76167d9 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -242,11 +242,7 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *pteptr, pte_t pteval) -{ - set_pte(pteptr, pteval); -} +#define PFN_PTE_SHIFT PAGE_SHIFT #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) @@ -290,6 +286,7 @@ struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); #define update_mmu_cache(vma,address,ptep) do {} while (0) +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do {} while (0) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that -- cgit v1.2.3 From a3e1c9372c9b95945aa58f318990cf3e4bf2881d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:56 +0100 Subject: x86: implement the new page table range API Add PFN_PTE_SHIFT and a noop update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-29-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cd0b6337d03c..dbf8af70b7c2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -185,6 +185,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -1020,13 +1022,6 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_ptes_set(mm, ptep, pte, 1); - set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { @@ -1292,6 +1287,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { -- cgit v1.2.3 From 4fbb7e7f47dbc631a9f5bad3171ccbca171ed1d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:57 +0100 Subject: xtensa: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Link: https://lkml.kernel.org/r/20230802151406.3735276-30-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Max Filippov Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/cacheflush.h | 9 +++- arch/xtensa/include/asm/pgtable.h | 18 ++++---- arch/xtensa/mm/cache.c | 83 ++++++++++++++++++++---------------- 3 files changed, 63 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index 7b4359312c25..35153f6725e4 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -119,8 +119,14 @@ void flush_cache_page(struct vm_area_struct*, #define flush_cache_vmap(start,end) flush_cache_all() #define flush_cache_vunmap(start,end) flush_cache_all() +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *); +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} void local_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -156,6 +162,7 @@ void local_flush_cache_page(struct vm_area_struct *vma, /* This is not required, see Documentation/core-api/cachetlb.rst */ #define flush_icache_page(vma,page) do { } while (0) +#define flush_icache_pages(vma, page, nr) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index fc7a14884c6c..ef79cb6c20dc 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -274,6 +274,7 @@ static inline pte_t pte_mkwrite(pte_t pte) * and a page entry and page directory to the page they refer to. */ +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pte_same(a,b) (pte_val(a) == pte_val(b)) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -301,15 +302,9 @@ static inline void update_pte(pte_t *ptep, pte_t pteval) struct mm_struct; -static inline void -set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) -{ - update_pte(ptep, pteval); -} - -static inline void set_pte(pte_t *ptep, pte_t pteval) +static inline void set_pte(pte_t *ptep, pte_t pte) { - update_pte(ptep, pteval); + update_pte(ptep, pte); } static inline void @@ -407,8 +402,11 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) #else -extern void update_mmu_cache(struct vm_area_struct * vma, - unsigned long address, pte_t *ptep); +struct vm_fault; +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); +#define update_mmu_cache(vma, address, ptep) \ + update_mmu_cache_range(NULL, vma, address, ptep, 1) typedef pte_t *pte_addr_t; diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 19e5a478a7e8..7ec66a79f472 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -121,9 +121,9 @@ EXPORT_SYMBOL(copy_user_highpage); * */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - struct address_space *mapping = page_mapping_file(page); + struct address_space *mapping = folio_flush_mapping(folio); /* * If we have a mapping but the page is not mapped to user-space @@ -132,14 +132,14 @@ void flush_dcache_page(struct page *page) */ if (mapping && !mapping_mapped(mapping)) { - if (!test_bit(PG_arch_1, &page->flags)) - set_bit(PG_arch_1, &page->flags); + if (!test_bit(PG_arch_1, &folio->flags)) + set_bit(PG_arch_1, &folio->flags); return; } else { - - unsigned long phys = page_to_phys(page); - unsigned long temp = page->index << PAGE_SHIFT; + unsigned long phys = folio_pfn(folio) * PAGE_SIZE; + unsigned long temp = folio_pos(folio); + unsigned int i, nr = folio_nr_pages(folio); unsigned long alias = !(DCACHE_ALIAS_EQ(temp, phys)); unsigned long virt; @@ -154,22 +154,26 @@ void flush_dcache_page(struct page *page) return; preempt_disable(); - virt = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(virt, phys); + for (i = 0; i < nr; i++) { + virt = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(virt, phys); - virt = TLBTEMP_BASE_1 + (temp & DCACHE_ALIAS_MASK); + virt = TLBTEMP_BASE_1 + (temp & DCACHE_ALIAS_MASK); - if (alias) - __flush_invalidate_dcache_page_alias(virt, phys); + if (alias) + __flush_invalidate_dcache_page_alias(virt, phys); - if (mapping) - __invalidate_icache_page_alias(virt, phys); + if (mapping) + __invalidate_icache_page_alias(virt, phys); + phys += PAGE_SIZE; + temp += PAGE_SIZE; + } preempt_enable(); } /* There shouldn't be an entry in the cache for this page anymore. */ } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); /* * For now, flush the whole cache. FIXME?? @@ -207,45 +211,52 @@ EXPORT_SYMBOL(local_flush_cache_page); #endif /* DCACHE_WAY_SIZE > PAGE_SIZE */ -void -update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); - struct page *page; + struct folio *folio; + unsigned int i; if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); + folio = page_folio(pfn_to_page(pfn)); - /* Invalidate old entry in TLBs */ - - flush_tlb_page(vma, addr); + /* Invalidate old entries in TLBs */ + for (i = 0; i < nr; i++) + flush_tlb_page(vma, addr + i * PAGE_SIZE); + nr = folio_nr_pages(folio); #if (DCACHE_WAY_SIZE > PAGE_SIZE) - if (!PageReserved(page) && test_bit(PG_arch_1, &page->flags)) { - unsigned long phys = page_to_phys(page); + if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) { + unsigned long phys = folio_pfn(folio) * PAGE_SIZE; unsigned long tmp; preempt_disable(); - tmp = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(tmp, phys); - tmp = TLBTEMP_BASE_1 + (addr & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(tmp, phys); - __invalidate_icache_page_alias(tmp, phys); + for (i = 0; i < nr; i++) { + tmp = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(tmp, phys); + tmp = TLBTEMP_BASE_1 + (addr & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(tmp, phys); + __invalidate_icache_page_alias(tmp, phys); + phys += PAGE_SIZE; + } preempt_enable(); - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); } #else - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags) + if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags) && (vma->vm_flags & VM_EXEC) != 0) { - unsigned long paddr = (unsigned long)kmap_atomic(page); - __flush_dcache_page(paddr); - __invalidate_icache_page(paddr); - set_bit(PG_arch_1, &page->flags); - kunmap_atomic((void *)paddr); + for (i = 0; i < nr; i++) { + void *paddr = kmap_local_folio(folio, i * PAGE_SIZE); + __flush_dcache_page((unsigned long)paddr); + __invalidate_icache_page((unsigned long)paddr); + kunmap_local(paddr); + } + set_bit(PG_arch_1, &folio->flags); } #endif } -- cgit v1.2.3 From 203b7b6aad6769a43987deb81c35456de8bb16c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:59 +0100 Subject: mm: rationalise flush_icache_pages() and flush_icache_page() Move the default (no-op) implementation of flush_icache_pages() to from . Remove the flush_icache_page() wrapper from each architecture into . Link: https://lkml.kernel.org/r/20230802151406.3735276-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- arch/alpha/include/asm/cacheflush.h | 5 +---- arch/arc/include/asm/cacheflush.h | 9 --------- arch/arm/include/asm/cacheflush.h | 7 ------- arch/csky/abiv1/inc/abi/cacheflush.h | 1 - arch/csky/abiv2/inc/abi/cacheflush.h | 1 - arch/hexagon/include/asm/cacheflush.h | 2 +- arch/loongarch/include/asm/cacheflush.h | 2 -- arch/m68k/include/asm/cacheflush_mm.h | 1 - arch/mips/include/asm/cacheflush.h | 6 ------ arch/nios2/include/asm/cacheflush.h | 2 +- arch/parisc/include/asm/cacheflush.h | 2 +- arch/sh/include/asm/cacheflush.h | 2 +- arch/sparc/include/asm/cacheflush_32.h | 2 -- arch/sparc/include/asm/cacheflush_64.h | 3 --- arch/xtensa/include/asm/cacheflush.h | 4 ---- 15 files changed, 5 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 3956460e69e2..36a7e924c3b9 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -53,10 +53,6 @@ extern void flush_icache_user_page(struct vm_area_struct *vma, #define flush_icache_user_page flush_icache_user_page #endif /* CONFIG_SMP */ -/* This is used only in __do_fault and do_swap_page. */ -#define flush_icache_page(vma, page) \ - flush_icache_user_page((vma), (page), 0, 0) - /* * Both implementations of flush_icache_user_page flush the entire * address space, so one call, no matter how many pages. @@ -66,6 +62,7 @@ static inline void flush_icache_pages(struct vm_area_struct *vma, { flush_icache_user_page(vma, page, 0, 0); } +#define flush_icache_pages flush_icache_pages #include diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index 04f65f588510..bd5b1a9a0544 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -18,15 +18,6 @@ #include #include -/* - * Semantically we need this because icache doesn't snoop dcache/dma. - * However ARC Cache flush requires paddr as well as vaddr, latter not available - * in the flush_icache_page() API. So we no-op it but do the equivalent work - * in update_mmu_cache() - */ -#define flush_icache_page(vma, page) -#define flush_icache_pages(vma, page, nr) - void flush_cache_all(void); void flush_icache_range(unsigned long kstart, unsigned long kend); diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 841e268d2374..f6181f69577f 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -321,13 +321,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma,page) do { } while (0) -#define flush_icache_pages(vma, page, nr) do { } while (0) - /* * flush_cache_vmap() is used when creating mappings (eg, via vmap, * vmalloc, ioremap etc) in kernel space for pages. On non-VIPT diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 0d6cb65624c4..908d8b0bc4fd 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -45,7 +45,6 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u #define flush_cache_vmap(start, end) cache_wbinv_all() #define flush_cache_vunmap(start, end) cache_wbinv_all() -#define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) #define flush_icache_mm_range(mm, start, end) cache_wbinv_range(start, end) #define flush_icache_deferred(mm) do {} while (0); diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index 9c728933a776..40be16907267 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -33,7 +33,6 @@ static inline void flush_dcache_page(struct page *page) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) #define flush_icache_range(start, end) cache_wbinv_range(start, end) diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index dc3f500a5a01..bfff514a81c8 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -18,7 +18,7 @@ * - flush_cache_range(vma, start, end) flushes a range of pages * - flush_icache_range(start, end) flush a range of instructions * - flush_dcache_page(pg) flushes(wback&invalidates) a page for dcache - * - flush_icache_page(vma, pg) flushes(invalidates) a page for icache + * - flush_icache_pages(vma, pg, nr) flushes(invalidates) nr pages for icache * * Need to doublecheck which one is really needed for ptrace stuff to work. */ diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 88a44da50a3b..80bd74106985 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -46,8 +46,6 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) -#define flush_icache_pages(vma, page) do { } while (0) #define flush_icache_user_page(vma, page, addr, len) do { } while (0) #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 88eb85e81ef6..ed12358c4783 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -261,7 +261,6 @@ static inline void __flush_pages_to_ram(void *vaddr, unsigned int nr) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_pages(vma, page, nr) \ __flush_pages_to_ram(page_address(page), nr) -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 0f389bc7cb90..f36c2519ed97 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -82,12 +82,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(page, vmaddr); } -static inline void flush_icache_pages(struct vm_area_struct *vma, - struct page *page, unsigned int nr) -{ -} -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) - extern void (*flush_icache_range)(unsigned long start, unsigned long end); extern void (*local_flush_icache_range)(unsigned long start, unsigned long end); extern void (*__flush_icache_user_range)(unsigned long start, diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 8624ca83cffe..7c48c5213fb7 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -35,7 +35,7 @@ void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1); +#define flush_icache_pages flush_icache_pages #define flush_cache_vmap(start, end) flush_dcache_range(start, end) #define flush_cache_vunmap(start, end) flush_dcache_range(start, end) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index b77c3e0c37d3..b4006f2a9705 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -60,7 +60,7 @@ static inline void flush_dcache_page(struct page *page) void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) +#define flush_icache_pages flush_icache_pages #define flush_icache_range(s,e) do { \ flush_kernel_dcache_range_asm(s,e); \ diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 9fceef6f3e00..878b6b551bd2 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -53,7 +53,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) +#define flush_icache_pages flush_icache_pages extern void flush_cache_sigtramp(unsigned long address); struct flusher_data { diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index c8dd971f0e88..f3b7270bf71b 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -16,8 +16,6 @@ #define flush_cache_page(vma,addr,pfn) \ sparc32_cachetlb_ops->cache_page(vma, addr) #define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_pages(vma, pg, nr) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index a9a719f04d06..0e879004efff 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -53,9 +53,6 @@ static inline void flush_dcache_page(struct page *page) flush_dcache_folio(page_folio(page)); } -#define flush_icache_page(vma, pg) do { } while(0) -#define flush_icache_pages(vma, pg, nr) do { } while(0) - void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, unsigned long len, int write); diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index 35153f6725e4..785a00ce83c1 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -160,10 +160,6 @@ void local_flush_cache_page(struct vm_area_struct *vma, __invalidate_icache_range(start,(end) - (start)); \ } while (0) -/* This is not required, see Documentation/core-api/cachetlb.rst */ -#define flush_icache_page(vma,page) do { } while (0) -#define flush_icache_pages(vma, page, nr) do { } while (0) - #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -- cgit v1.2.3 From 00de2c9f26b15f1a6f2af516dd8ec5f8d28189b7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 10 Aug 2023 09:32:41 +0000 Subject: arm64: mm: use ptep_clear() instead of pte_clear() in clear_flush() In clear_flush(), the original pte may be a present entry, so we should use ptep_clear() to let page_table_check track the pte clearing operation, otherwise it may cause false positive in subsequent set_pte_at(). Link: https://lkml.kernel.org/r/20230810093241.1181142-1-qi.zheng@linux.dev Fixes: 42b2547137f5 ("arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") Signed-off-by: Qi Zheng Acked-by: Will Deacon Cc: Catalin Marinas Cc: Kefeng Wang Cc: Muchun Song Cc: Pasha Tatashin Cc: Qi Zheng Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 21716c940682..9c52718ea750 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + ptep_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } -- cgit v1.2.3 From cfeed8ffe55b37fa10286aaaa1369da00cb88440 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:46 +0200 Subject: mm/swap: stop using page->private on tail pages for THP_SWAP Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups". This series stops using page->private on tail pages for THP_SWAP, replaces folio->private by folio->swap for swapcache folios, and starts using "new_folio" for tail pages that we are splitting to remove the usage of page->private for swapcache handling completely. This patch (of 4): Let's stop using page->private on tail pages, making it possible to just unconditionally reuse that field in the tail pages of large folios. The remaining usage of the private field for THP_SWAP is in the THP splitting code (mm/huge_memory.c), that we'll handle separately later. Update the THP_SWAP documentation and sanity checks in mm_types.h and __split_huge_page_tail(). [david@redhat.com: stop using page->private on tail pages for THP_SWAP] Link: https://lkml.kernel.org/r/6f0a82a3-6948-20d9-580b-be1dbf415701@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-1-david@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas [arm64] Reviewed-by: Yosry Ahmed Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mteswap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index cd508ba80ab1..a31833e3ddc5 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -33,8 +33,9 @@ int mte_save_tags(struct page *page) mte_save_page_tags(page_address(page), tag_storage); - /* page_private contains the swap entry.val set in do_swap_page */ - ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL); + /* lookup the swap entry.val from the page */ + ret = xa_store(&mte_pages, page_swap_entry(page).val, tag_storage, + GFP_KERNEL); if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { mte_free_tag_storage(tag_storage); return xa_err(ret); -- cgit v1.2.3 From 7db15418d390cf878d7c77ae08a4ad39f1534bc5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 22 Aug 2023 16:27:49 +0200 Subject: nios2: fix flush_dcache_page() for usage from irq context Since at least kernel 6.1, flush_dcache_page() is called with IRQs disabled, e.g. from aio_complete(). But the current implementation for flush_dcache_page() on NIOS2 unintentionally re-enables IRQs, which may lead to deadlocks. Fix it by using xa_lock_irqsave() and xa_unlock_irqrestore() for the flush_dcache_mmap_*lock() macros instead. Link: https://lkml.kernel.org/r/ZOTF5WWURQNH9+iw@p100 Signed-off-by: Helge Deller Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/cacheflush.h | 4 ++++ arch/nios2/mm/cacheflush.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 7c48c5213fb7..348cea097792 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -52,5 +52,9 @@ extern void invalidate_dcache_range(unsigned long start, unsigned long end); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define flush_dcache_mmap_lock_irqsave(mapping, flags) \ + xa_lock_irqsave(&mapping->i_pages, flags) +#define flush_dcache_mmap_unlock_irqrestore(mapping, flags) \ + xa_unlock_irqrestore(&mapping->i_pages, flags) #endif /* _ASM_NIOS2_CACHEFLUSH_H */ diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 28b805f465a8..0ee9c5f02e08 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -75,12 +75,13 @@ static void flush_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; struct vm_area_struct *vma; + unsigned long flags; pgoff_t pgoff; unsigned long nr = folio_nr_pages(folio); pgoff = folio->index; - flush_dcache_mmap_lock(mapping); + flush_dcache_mmap_lock_irqsave(mapping, flags); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { unsigned long start; @@ -92,7 +93,7 @@ static void flush_aliases(struct address_space *mapping, struct folio *folio) start = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); flush_cache_range(vma, start, start + nr * PAGE_SIZE); } - flush_dcache_mmap_unlock(mapping); + flush_dcache_mmap_unlock_irqrestore(mapping, flags); } void flush_cache_all(void) -- cgit v1.2.3