diff options
Diffstat (limited to 'arch/s390')
-rw-r--r-- | arch/s390/Kbuild | 2 | ||||
-rw-r--r-- | arch/s390/Kconfig | 70 | ||||
-rw-r--r-- | arch/s390/appldata/appldata_base.c | 2 | ||||
-rw-r--r-- | arch/s390/configs/btf.config | 1 | ||||
-rw-r--r-- | arch/s390/configs/kasan.config | 1 | ||||
-rw-r--r-- | arch/s390/include/asm/hugetlb.h | 2 | ||||
-rw-r--r-- | arch/s390/include/asm/io.h | 21 | ||||
-rw-r--r-- | arch/s390/include/asm/pgalloc.h | 8 | ||||
-rw-r--r-- | arch/s390/include/asm/pgtable.h | 37 | ||||
-rw-r--r-- | arch/s390/include/asm/tlb.h | 4 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 5 | ||||
-rw-r--r-- | arch/s390/mm/pageattr.c | 4 | ||||
-rw-r--r-- | arch/s390/mm/pgalloc.c | 176 | ||||
-rw-r--r-- | arch/s390/pci/pci.c | 57 |
14 files changed, 205 insertions, 185 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index 8e4d74f51115..a5d3503b353c 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -7,7 +7,7 @@ obj-$(CONFIG_S390_HYPFS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ obj-$(CONFIG_PCI) += pci/ -obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/ +obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/ # for cleaning subdir- += boot tools diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 18bf754e1fad..ae29e4392664 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,8 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_KERNEL_PMD_MKWRITE + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI @@ -143,6 +144,7 @@ config S390 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS + select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL @@ -214,6 +216,7 @@ config S390 select HAVE_VIRT_CPU_ACCOUNTING_IDLE select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select KEXEC select MMU_GATHER_MERGE_VMAS select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE @@ -245,6 +248,25 @@ config PGTABLE_LEVELS source "kernel/livepatch/Kconfig" +config ARCH_SUPPORTS_KEXEC + def_bool y + +config ARCH_SUPPORTS_KEXEC_FILE + def_bool CRYPTO && CRYPTO_SHA256 && CRYPTO_SHA256_S390 + +config ARCH_SUPPORTS_KEXEC_SIG + def_bool MODULE_SIG_FORMAT + +config ARCH_SUPPORTS_KEXEC_PURGATORY + def_bool KEXEC_FILE + +config ARCH_SUPPORTS_CRASH_DUMP + def_bool y + help + Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this. + This option also enables s390 zfcpdump. + See also <file:Documentation/arch/s390/zfcpdump.rst> + menu "Processor type and features" config HAVE_MARCH_Z10_FEATURES @@ -483,36 +505,6 @@ config SCHED_TOPOLOGY source "kernel/Kconfig.hz" -config KEXEC - def_bool y - select KEXEC_CORE - -config KEXEC_FILE - bool "kexec file based system call" - select KEXEC_CORE - depends on CRYPTO - depends on CRYPTO_SHA256 - depends on CRYPTO_SHA256_S390 - help - Enable the kexec file based system call. In contrast to the normal - kexec system call this system call takes file descriptors for the - kernel and initramfs as arguments. - -config ARCH_HAS_KEXEC_PURGATORY - def_bool y - depends on KEXEC_FILE - -config KEXEC_SIG - bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC_FILE && MODULE_SIG_FORMAT - help - This option makes kernel signature verification mandatory for - the kexec_file_load() syscall. - - In addition to that option, you need to enable signature - verification for the corresponding kernel image type being - loaded in order for this to work. - config CERT_STORE bool "Get user certificates via DIAG320" depends on KEYS @@ -745,22 +737,6 @@ config VFIO_AP endmenu -menu "Dump support" - -config CRASH_DUMP - bool "kernel crash dumps" - select KEXEC - help - Generate crash dump after being started by kexec. - Crash dump kernels are loaded in the main kernel with kexec-tools - into a specially reserved region and then later executed after - a crash by kdump/kexec. - Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this. - This option also enables s390 zfcpdump. - See also <file:Documentation/arch/s390/zfcpdump.rst> - -endmenu - config CCW def_bool y diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index bbefe5e86bdf..3b0994625652 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -365,7 +365,7 @@ int appldata_register_ops(struct appldata_ops *ops) ops->ctl_table[0].proc_handler = appldata_generic_handler; ops->ctl_table[0].data = ops; - ops->sysctl_header = register_sysctl(appldata_proc_name, ops->ctl_table); + ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1); if (!ops->sysctl_header) goto out; return 0; diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config index 39227b4511af..eb7f84f5925c 100644 --- a/arch/s390/configs/btf.config +++ b/arch/s390/configs/btf.config @@ -1 +1,2 @@ +# Help: Enable BTF debug info CONFIG_DEBUG_INFO_BTF=y diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config index 700a8b25c3ff..84c2b551e992 100644 --- a/arch/s390/configs/kasan.config +++ b/arch/s390/configs/kasan.config @@ -1,3 +1,4 @@ +# Help: Enable KASan for debugging CONFIG_KASAN=y CONFIG_KASAN_INLINE=y CONFIG_KASAN_VMALLOC=y diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index ccdbccfde148..f07267875a19 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -104,7 +104,7 @@ static inline int huge_pte_dirty(pte_t pte) static inline pte_t huge_pte_mkwrite(pte_t pte) { - return pte_mkwrite(pte); + return pte_mkwrite_novma(pte); } static inline pte_t huge_pte_mkdirty(pte_t pte) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index e3882b012bfa..4453ad7c11ac 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -22,11 +22,18 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); -void __iomem *ioremap(phys_addr_t addr, size_t size); -void __iomem *ioremap_wc(phys_addr_t addr, size_t size); -void __iomem *ioremap_wt(phys_addr_t addr, size_t size); -void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range -#define ioremap ioremap -#define ioremap_wt ioremap_wt -#define ioremap_wc ioremap_wc - #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 17eb618f1348..376b4b23bdaa 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } @@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 30909fe27c24..fb3ee7758b76 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -47,6 +47,7 @@ static inline void update_page_count(int level, long count) * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -1000,7 +1001,7 @@ static inline pte_t pte_wrprotect(pte_t pte) return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE)); if (pte_val(pte) & _PAGE_DIRTY) @@ -1314,20 +1315,34 @@ pgprot_t pgprot_writecombine(pgprot_t prot); pgprot_t pgprot_writethrough(pgprot_t prot); /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - set_pte(ptep, entry); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, @@ -1483,7 +1498,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b91f4a9b044c..383b1f91442c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index b5e1bea9194c..099c4824dd8a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -405,7 +405,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -416,7 +415,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) @@ -430,7 +430,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 43c919c7bafb..b87e96c64b61 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -98,7 +98,7 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) @@ -156,7 +156,7 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 66ab68db9842..07fc660a24aa 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ @@ -229,11 +229,20 @@ void page_table_free_pgste(struct page *page) * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable * while the PP bits are never used, nor such a page is added to or removed * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. */ unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long *table; - struct page *page; + struct ptdesc *ptdesc; unsigned int mask, bit; /* Try to get a fragment of a 4K page as a 2K page table */ @@ -241,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = NULL; spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; + ptdesc = list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; /* * The pending removal bits must also be checked. * Failure to do so might lead to an impossible @@ -255,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm) */ mask = (mask | (mask >> 4)) & 0x03U; if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, + atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)); - list_del(&page->lru); + list_del_init(&ptdesc->pt_list); } } spin_unlock_bh(&mm->context.lock); @@ -269,27 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); + arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 0x03U << 24); + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); spin_unlock_bh(&mm->context.lock); } return table; @@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table, { char msg[128]; - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) return; snprintf(msg, sizeof(msg), "Invalid pgtable %p release half 0x%02x mask 0x%02x", @@ -308,12 +320,20 @@ static void page_table_release_check(struct page *page, void *table, dump_page(page, msg); } +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); @@ -323,42 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) * will happen outside of the critical section from this * function or from __tlb_remove_table() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); + } else { + /* If page is on list, now remove it. */ + list_del_init(&ptdesc->pt_list); + } spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); mask >>= 24; if (mask != 0x00U) return; half = 0x01U << bit; } else { half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; } - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + else + pte_free_now(&ptdesc->pt_rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - struct page *page; unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); @@ -368,12 +396,20 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, * outside of the critical section from __tlb_remove_table() or from * page_table_free() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); + } else { + /* If page is on list, now remove it. */ + list_del_init(&ptdesc->pt_list); + } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); @@ -383,30 +419,48 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); switch (half) { case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(ptdesc); return; case 0x01U: /* lower 2K of a 4K page table */ case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); mask >>= 24; if (mask != 0x00U) return; break; case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; break; } - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + else + pte_free_now(&ptdesc->pt_rcu_head); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * Base infrastructure required to generate basic asces, region, segment, * and page tables that do not make use of enhanced features like EDAT1. @@ -432,16 +486,20 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index afc3f33788da..d34d5813d006 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -244,62 +244,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset, vaddr; - struct vm_struct *area; - phys_addr_t last_addr; - - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. + */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) addr; + return (void __iomem *)phys_addr; - offset = addr & ~PAGE_MASK; - addr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { - free_vm_area(area); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); -} - -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) -{ - return __ioremap(addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); -void __iomem *ioremap(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(ioremap); - -void __iomem *ioremap_wc(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem *ioremap_wt(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wt); - void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); |