Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile                    7
-rw-r--r--  arch/x86/mm/amdtopology.c               1
-rw-r--r--  arch/x86/mm/cpu_entry_area.c            2
-rw-r--r--  arch/x86/mm/extable.c                   9
-rw-r--r--  arch/x86/mm/fault.c                    67
-rw-r--r--  arch/x86/mm/highmem_32.c               34
-rw-r--r--  arch/x86/mm/hugetlbpage.c             130
-rw-r--r--  arch/x86/mm/ident_map.c               116
-rw-r--r--  arch/x86/mm/init.c                    148
-rw-r--r--  arch/x86/mm/init_32.c                  47
-rw-r--r--  arch/x86/mm/init_64.c                  92
-rw-r--r--  arch/x86/mm/ioremap.c                 132
-rw-r--r--  arch/x86/mm/kasan_init_64.c             1
-rw-r--r--  arch/x86/mm/kaslr.c                    48
-rw-r--r--  arch/x86/mm/mem_encrypt.c               9
-rw-r--r--  arch/x86/mm/mem_encrypt_amd.c          98
-rw-r--r--  arch/x86/mm/mem_encrypt_boot.S          1
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c     17
-rw-r--r--  arch/x86/mm/mmap.c                     18
-rw-r--r--  arch/x86/mm/numa.c                    622
-rw-r--r--  arch/x86/mm/numa_32.c                   1
-rw-r--r--  arch/x86/mm/numa_emulation.c          585
-rw-r--r--  arch/x86/mm/numa_internal.h            24
-rw-r--r--  arch/x86/mm/pat/cpa-test.c              2
-rw-r--r--  arch/x86/mm/pat/memtype.c             125
-rw-r--r--  arch/x86/mm/pat/set_memory.c          397
-rw-r--r--  arch/x86/mm/pgtable.c                 103
-rw-r--r--  arch/x86/mm/pti.c                      53
-rw-r--r--  arch/x86/mm/srat.c                      6
-rw-r--r--  arch/x86/mm/testmmiotrace.c             1
-rw-r--r--  arch/x86/mm/tlb.c                     517
31 files changed, 1410 insertions, 2003 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 428048e73bd2..32035d5be5a0 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,16 +34,14 @@ obj-y += pat/
CFLAGS_physaddr.o := -fno-stack-protector
CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
-CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
+CFLAGS_fault.o := -I $(src)/../include/asm/trace
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o
-obj-$(CONFIG_HIGHMEM) += highmem_32.o
-
KASAN_SANITIZE_kasan_init_$(BITS).o := n
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
@@ -57,7 +55,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
-obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 9332b36a1091..628833afee37 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/numa_memblks.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index e91500a80963..575f863f3c75 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -164,7 +164,7 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
}
}
#else
-static inline void percpu_setup_exception_stacks(unsigned int cpu)
+static void __init percpu_setup_exception_stacks(unsigned int cpu)
{
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b522933bfa56..51986e8a9d35 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
-static bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
-{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- return ex_handler_fault(fixup, regs, trapnr);
-}
-
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
@@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
return ex_handler_uaccess(e, regs, trapnr, fault_addr);
- case EX_TYPE_COPY:
- return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12ec7f08..296d294142c8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -7,7 +7,6 @@
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
-#include <linux/extable.h> /* search_exception_tables */
#include <linux/memblock.h> /* max_low_pfn */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
@@ -20,6 +19,7 @@
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
+#include <linux/vmalloc.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -514,18 +514,19 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
if (error_code & X86_PF_INSTR) {
unsigned int level;
+ bool nx, rw;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
- pte = lookup_address_in_pgd(pgd, address, &level);
+ pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
- if (pte && pte_present(*pte) && !pte_exec(*pte))
+ if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
- if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
@@ -676,7 +677,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
ASM_CALL_ARG3,
, [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
- unreachable();
+ BUG();
}
#endif
@@ -723,39 +724,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
@@ -865,14 +835,17 @@ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 pkey, int si_code)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma, u32 pkey, int si_code)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
@@ -896,7 +869,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma)
{
/*
* This OSPKE check is not strictly necessary at runtime.
@@ -926,9 +900,9 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
*/
u32 pkey = vma_pkey(vma);
- __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR);
} else {
- __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR);
}
}
@@ -1356,8 +1330,9 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
if (unlikely(access_error(error_code, vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ bad_area_access_error(regs, error_code, address, NULL, vma);
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -1393,7 +1368,7 @@ retry:
* we can handle it..
*/
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ bad_area_access_error(regs, error_code, address, mm, vma);
return;
}
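
The fault.c change above threads the locking context down to __bad_area(): when the fault was handled under a per-VMA lock, mm is passed as NULL and the VMA read lock is dropped instead of mmap_lock. A minimal user-space sketch of that release-whichever-lock-you-hold pattern (the struct and helper names below are illustrative stand-ins, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct mm  { bool read_locked; };
struct vma { bool vma_locked; };

/* Illustrative stand-ins for mmap_read_unlock()/vma_end_read(). */
static void mm_read_unlock(struct mm *mm)  { mm->read_locked = false; }
static void vma_end_read(struct vma *vma)  { vma->vma_locked = false; }

/*
 * Mirrors the new __bad_area() contract: exactly one of @mm / @vma
 * identifies the lock that is currently held, and it is dropped here
 * before the signal is delivered.
 */
static void bad_area(struct mm *mm, struct vma *vma)
{
	if (mm)
		mm_read_unlock(mm);
	else
		vma_end_read(vma);
	/* ...then report SIGSEGV to the task... */
}

int main(void)
{
	struct mm mm = { .read_locked = true };
	struct vma vma = { .vma_locked = true };

	bad_area(&mm, NULL);   /* slow path: mmap_lock was held */
	bad_area(NULL, &vma);  /* VMA-lock fast path */
	printf("%d %d\n", mm.read_locked, vma.vma_locked);
	return 0;
}
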
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
deleted file mode 100644
index d9efa35711ee..000000000000
--- a/arch/x86/mm/highmem_32.c
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/highmem.h>
-#include <linux/export.h>
-#include <linux/swap.h> /* for totalram_pages */
-#include <linux/memblock.h>
-#include <asm/numa.h>
-
-void __init set_highmem_pages_init(void)
-{
- struct zone *zone;
- int nid;
-
- /*
- * Explicitly reset zone->managed_pages because set_highmem_pages_init()
- * is invoked before memblock_free_all()
- */
- reset_all_zones_managed_pages();
- for_each_zone(zone) {
- unsigned long zone_start_pfn, zone_end_pfn;
-
- if (!is_highmem(zone))
- continue;
-
- zone_start_pfn = zone->zone_start_pfn;
- zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-
- nid = zone_to_nid(zone);
- printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
- zone->name, nid, zone_start_pfn, zone_end_pfn);
-
- add_highpages_with_active_regions(nid, zone_start_pfn,
- zone_end_pfn);
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5804bbae4f01..58f7f2bd535d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,136 +19,6 @@
#include <asm/tlbflush.h>
#include <asm/elf.h>
-/*
- * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pmd_huge(pmd_t pmd)
-{
- return !pmd_none(pmd) &&
- (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-}
-
-/*
- * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pud_huge(pud_t pud)
-{
-#if CONFIG_PGTABLE_LEVELS > 2
- return !pud_none(pud) &&
- (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-#else
- return 0;
-#endif
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.low_limit = get_mmap_base(1);
-
- /*
- * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
- * in the full address space.
- */
- info.high_limit = in_32bit_syscall() ?
- task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
-
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = get_mmap_base(0);
-
- /*
- * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
- * in the full address space.
- */
- if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
- info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
-
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE_LOW;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
-
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- /* No address checking. See comment at mmap_address_hint_valid() */
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr &= huge_page_mask(h);
- if (!mmap_address_hint_valid(addr, len))
- goto get_unmapped_area;
-
- vma = find_vma(mm, addr);
- if (!vma || addr + len <= vm_start_gap(vma))
- return addr;
- }
-
-get_unmapped_area:
- if (mm->get_unmapped_area == arch_get_unmapped_area)
- return hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
- else
- return hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
-}
-#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
bool __init arch_hugetlb_valid_size(unsigned long size)
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 968d7005f4a7..bd5d101c5c37 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -4,6 +4,79 @@
* included by both the compressed kernel and the regular kernel.
*/
+static void free_pte(struct x86_mapping_info *info, pmd_t *pmd)
+{
+ pte_t *pte = pte_offset_kernel(pmd, 0);
+
+ info->free_pgt_page(pte, info->context);
+}
+
+static void free_pmd(struct x86_mapping_info *info, pud_t *pud)
+{
+ pmd_t *pmd = pmd_offset(pud, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_present(pmd[i]))
+ continue;
+
+ if (pmd_leaf(pmd[i]))
+ continue;
+
+ free_pte(info, &pmd[i]);
+ }
+
+ info->free_pgt_page(pmd, info->context);
+}
+
+static void free_pud(struct x86_mapping_info *info, p4d_t *p4d)
+{
+ pud_t *pud = pud_offset(p4d, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_present(pud[i]))
+ continue;
+
+ if (pud_leaf(pud[i]))
+ continue;
+
+ free_pmd(info, &pud[i]);
+ }
+
+ info->free_pgt_page(pud, info->context);
+}
+
+static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd)
+{
+ p4d_t *p4d = p4d_offset(pgd, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ if (!p4d_present(p4d[i]))
+ continue;
+
+ free_pud(info, &p4d[i]);
+ }
+
+ if (pgtable_l5_enabled())
+ info->free_pgt_page(p4d, info->context);
+}
+
+void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (!pgd_present(pgd[i]))
+ continue;
+
+ free_p4d(info, &pgd[i]);
+ }
+
+ info->free_pgt_page(pgd, info->context);
+}
+
static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
@@ -26,18 +99,29 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ bool use_gbpage;
- next = (addr & PUD_MASK) + PUD_SIZE;
- if (next > end)
- next = end;
+ next = pud_addr_end(addr, end);
- if (info->direct_gbpages) {
- pud_t pudval;
+ /* if this is already a gbpage, this portion is already mapped */
+ if (pud_leaf(*pud))
+ continue;
- if (pud_present(*pud))
- continue;
+ /* Is using a gbpage allowed? */
+ use_gbpage = info->direct_gbpages;
+
+ /* Don't use gbpage if it maps more than the requested region. */
+ /* at the beginning: */
+ use_gbpage &= ((addr & ~PUD_MASK) == 0);
+ /* ... or at the end: */
+ use_gbpage &= ((next & ~PUD_MASK) == 0);
+
+ /* Never overwrite existing mappings */
+ use_gbpage &= !pud_present(*pud);
+
+ if (use_gbpage) {
+ pud_t pudval;
- addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;
@@ -68,10 +152,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
p4d_t *p4d = p4d_page + p4d_index(addr);
pud_t *pud;
- next = (addr & P4D_MASK) + P4D_SIZE;
- if (next > end)
- next = end;
-
+ next = p4d_addr_end(addr, end);
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, 0);
result = ident_pud_init(info, pud, addr, next);
@@ -88,7 +169,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
if (result)
return result;
- set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
+ set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag | _PAGE_NOPTISHADOW));
}
return 0;
@@ -113,10 +194,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
pgd_t *pgd = pgd_page + pgd_index(addr);
p4d_t *p4d;
- next = (addr & PGDIR_MASK) + PGDIR_SIZE;
- if (next > end)
- next = end;
-
+ next = pgd_addr_end(addr, end);
if (pgd_present(*pgd)) {
p4d = p4d_offset(pgd, 0);
result = ident_p4d_init(info, p4d, addr, next);
@@ -132,14 +210,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
if (result)
return result;
if (pgtable_l5_enabled()) {
- set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
+ set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag | _PAGE_NOPTISHADOW));
} else {
/*
* With p4d folded, pgd is equal to p4d.
* The pgd entry has to point to the pud page table in this case.
*/
pud_t *pud = pud_offset(p4d, 0);
- set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
+ set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag | _PAGE_NOPTISHADOW));
}
}
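
In the new ident_pud_init() above, a 1 GiB leaf is only installed when the requested range covers the whole PUD-aligned region and nothing is mapped there yet. A standalone sketch of that alignment test, assuming x86-64's 1 GiB PUD_SIZE (the helper name may_use_gbpage() is hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PUD_SHIFT 30                     /* 1 GiB regions, as on x86-64 */
#define PUD_SIZE  (1ULL << PUD_SHIFT)
#define PUD_MASK  (~(PUD_SIZE - 1))

/*
 * Same test as the new ident_pud_init(): a gbpage is only allowed when
 * [addr, next) starts and ends on a PUD boundary, i.e. the request covers
 * the whole 1 GiB region and mapping it as one leaf over-maps nothing.
 */
static bool may_use_gbpage(uint64_t addr, uint64_t next, bool direct_gbpages,
			   bool pud_already_present)
{
	bool use_gbpage = direct_gbpages;

	use_gbpage &= ((addr & ~PUD_MASK) == 0);  /* aligned at the beginning */
	use_gbpage &= ((next & ~PUD_MASK) == 0);  /* ...and at the end */
	use_gbpage &= !pud_already_present;       /* never overwrite mappings */
	return use_gbpage;
}

int main(void)
{
	/* Full 1 GiB region: eligible. Partial region: falls back to 2 MiB PMDs. */
	printf("%d\n", may_use_gbpage(0x40000000, 0x80000000, true, false)); /* 1 */
	printf("%d\n", may_use_gbpage(0x40000000, 0x7ff00000, true, false)); /* 0 */
	return 0;
}
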
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 679893ea5e68..bfa444a7dbb0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -7,6 +7,7 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
+#include <linux/execmem.h>
#include <asm/set_memory.h>
#include <asm/cpu_device_id.h>
@@ -261,33 +262,34 @@ static void __init probe_page_size_mask(void)
}
}
-#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
- .family = 6, \
- .model = _model, \
- }
/*
- * INVLPG may not properly flush Global entries
- * on these CPUs when PCIDs are enabled.
+ * INVLPG may not properly flush Global entries on
+ * these CPUs. New microcode fixes the issue.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e),
{}
};
static void setup_pcid(void)
{
+ const struct x86_cpu_id *invlpg_miss_match;
+
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
- if (x86_match_cpu(invlpg_miss_ids)) {
+ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids);
+
+ if (invlpg_miss_match &&
+ boot_cpu_data.microcode < invlpg_miss_match->driver_data) {
pr_info("Incomplete global flushes, disabling PCID");
setup_clear_cpu_cap(X86_FEATURE_PCID);
return;
@@ -643,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start,
*/
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
map_end);
- memblock_phys_free(addr, PMD_SIZE);
- real_end = addr + PMD_SIZE;
+ if (!addr) {
+ pr_warn("Failed to release memory for alloc_low_pages()");
+ real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+ } else {
+ memblock_phys_free(addr, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+ }
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
@@ -990,53 +997,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-/*
- * Calculate the precise size of the DMA zone (first 16 MB of RAM),
- * and pass it to the MM layer - to help it set zone watermarks more
- * accurately.
- *
- * Done on 64-bit systems only for the time being, although 32-bit systems
- * might benefit from this as well.
- */
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
- u64 nr_pages = 0, nr_free_pages = 0;
- unsigned long start_pfn, end_pfn;
- phys_addr_t start_addr, end_addr;
- int i;
- u64 u;
-
- /*
- * Iterate over all memory ranges (free and reserved ones alike),
- * to calculate the total number of pages in the first 16 MB of RAM:
- */
- nr_pages = 0;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min(start_pfn, MAX_DMA_PFN);
- end_pfn = min(end_pfn, MAX_DMA_PFN);
-
- nr_pages += end_pfn - start_pfn;
- }
-
- /*
- * Iterate over free memory ranges to calculate the number of free
- * pages in the DMA zone, while not counting potential partial
- * pages at the beginning or the end of the range:
- */
- nr_free_pages = 0;
- for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
- start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
-
- if (start_pfn < end_pfn)
- nr_free_pages += end_pfn - start_pfn;
- }
-
- set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
-
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1099,3 +1059,67 @@ unsigned long arch_max_swapfile_size(void)
return pages;
}
#endif
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
+{
+ /* fill memory with INT3 instructions */
+ if (writeable)
+ memset(ptr, INT3_INSN_OPCODE, size);
+ else
+ text_poke_set(ptr, INT3_INSN_OPCODE, size);
+}
+#endif
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ unsigned long start, offset = 0;
+ enum execmem_range_flags flags;
+ pgprot_t pgprot;
+
+ if (kaslr_enabled())
+ offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+ start = MODULES_VADDR + offset;
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) &&
+ cpu_feature_enabled(X86_FEATURE_PSE)) {
+ pgprot = PAGE_KERNEL_ROX;
+ flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE;
+ } else {
+ pgprot = PAGE_KERNEL;
+ flags = EXECMEM_KASAN_SHADOW;
+ }
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_MODULE_TEXT] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_KPROBES ... EXECMEM_BPF] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_MODULE_DATA] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
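
setup_pcid() now keeps PCID enabled when the loaded microcode is at least the revision stored in the match entry's driver_data. A user-space sketch of that table-lookup-plus-threshold pattern; the struct, model numbers and revisions below are illustrative, not the real x86_cpu_id machinery:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for struct x86_cpu_id: model + data payload. */
struct cpu_id {
	uint32_t model;        /* matched CPU model (hypothetical values)    */
	uint64_t driver_data;  /* minimum microcode revision with the fix    */
};

static const struct cpu_id invlpg_miss_ids[] = {
	{ .model = 0x97, .driver_data = 0x2e  },   /* example entries only */
	{ .model = 0x9a, .driver_data = 0x42c },
	{ 0 }                                      /* terminator */
};

static const struct cpu_id *match_cpu(const struct cpu_id *ids, uint32_t model)
{
	for (; ids->model; ids++)
		if (ids->model == model)
			return ids;
	return NULL;
}

/* PCID stays enabled when the loaded microcode already contains the fix. */
static bool must_disable_pcid(uint32_t model, uint32_t microcode_rev)
{
	const struct cpu_id *m = match_cpu(invlpg_miss_ids, model);

	return m && microcode_rev < m->driver_data;
}

int main(void)
{
	printf("%d\n", must_disable_pcid(0x97, 0x2d)); /* old ucode -> disable */
	printf("%d\n", must_disable_pcid(0x97, 0x2e)); /* fixed ucode -> keep  */
	return 0;
}
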
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ac41b1e0940d..ad662cc4605c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -394,23 +394,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = virt_to_kpte(vaddr);
}
-
-void __init add_highpages_with_active_regions(int nid,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- phys_addr_t start, end;
- u64 i;
-
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
- unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
- start_pfn, end_pfn);
- unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
- start_pfn, end_pfn);
- for ( ; pfn < e_pfn; pfn++)
- if (pfn_valid(pfn))
- free_highmem_page(pfn_to_page(pfn));
- }
-}
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
@@ -582,7 +565,7 @@ static void __init lowmem_pfn_init(void)
"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
#define MSG_HIGHMEM_TRIMMED \
- "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+ "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n"
/*
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
@@ -606,18 +589,13 @@ static void __init highmem_pfn_init(void)
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
}
-#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
@@ -650,9 +628,6 @@ void __init initmem_init(void)
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
-#ifdef CONFIG_FLATMEM
- max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
-#endif
__vmalloc_start_set = true;
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
@@ -714,27 +689,17 @@ static void __init test_wp_bit(void)
panic("Linux doesn't support CPUs with broken WP.");
}
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
- /*
- * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
- * be done before memblock_free_all(). Memblock use free low memory for
- * temporary data (see find_range_array()) and for this purpose can use
- * pages that was already passed to the buddy allocator, hence marked as
- * not accessible in the page tables when compiled with
- * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
- * important here.
- */
- set_highmem_pages_init();
-
- /* this will put all low memory onto the freelists */
- memblock_free_all();
+}
+void __init mem_init(void)
+{
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7e177856ee4f..7c4f6f591f2b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -469,7 +469,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
}
@@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
@@ -611,7 +611,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
}
@@ -698,7 +698,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
}
@@ -950,14 +950,27 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
- /* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
- nr_pages << PAGE_SHIFT);
+ /*
+ * Special case: add_pages() is called by memremap_pages() for adding device
+ * private pages. Do not bump up max_pfn in the device private path,
+ * because max_pfn changes affect dma_addressing_limited().
+ *
+ * dma_addressing_limited() returning true when max_pfn is the device's
+ * addressable memory can force device drivers to use bounce buffers
+ * and impact their performance negatively:
+ */
+ if (!params->pgmap)
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
return ret;
}
@@ -973,24 +986,32 @@ int arch_add_memory(int nid, u64 start, u64 size,
return add_pages(nid, start_pfn, nr_pages, params);
}
-static void __meminit free_pagetable(struct page *page, int order)
+static void free_reserved_pages(struct page *page, unsigned long nr_pages)
{
- unsigned long magic;
- unsigned int nr_pages = 1 << order;
+ while (nr_pages--)
+ free_reserved_page(page++);
+}
+static void __meminit free_pagetable(struct page *page, int order)
+{
/* bootmem page has reserved flag */
if (PageReserved(page)) {
- __ClearPageReserved(page);
+ unsigned long nr_pages = 1 << order;
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+ enum bootmem_type type = bootmem_type(page);
- magic = page->index;
- if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
- } else
- while (nr_pages--)
- free_reserved_page(page++);
- } else
+ } else {
+ free_reserved_pages(page, nr_pages);
+ }
+#else
+ free_reserved_pages(page, nr_pages);
+#endif
+ } else {
free_pages((unsigned long)page_address(page), order);
+ }
}
static void __meminit free_hugepage_table(struct page *page,
@@ -1328,14 +1349,15 @@ failed:
panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
}
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
pci_iommu_alloc();
+}
+void __init mem_init(void)
+{
/* clear_bss() already clear the empty_zero_page */
- /* this will put all memory onto the freelists */
- memblock_free_all();
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
@@ -1354,18 +1376,6 @@ void __init mem_init(void)
preallocate_vmalloc_pages();
}
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
-{
- /*
- * More CPUs always led to greater speedups on tested systems, up to
- * all the nodes' CPUs. Use all since the system is otherwise idle
- * now.
- */
- return max_t(int, cpumask_weight(node_cpumask), 1);
-}
-#endif
-
int kernel_set_to_readonly;
void mark_rodata_ro(void)
@@ -1591,11 +1601,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
}
get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
- if (!boot_cpu_has(X86_FEATURE_PSE)) {
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_PSE) || !pmd_leaf(*pmd)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- continue;
get_page_bootmem(section_nr, pmd_page(*pmd),
MIX_SECTION_INFO);
@@ -1606,12 +1619,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
SECTION_INFO);
} else {
next = pmd_addr_end(addr, end);
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- continue;
-
- nr_pmd_pages = 1 << get_order(PMD_SIZE);
+ nr_pmd_pages = (next - addr) >> PAGE_SHIFT;
page = pmd_page(*pmd);
while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
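
register_page_bootmem_memmap() above now counts only the pages the current PMD actually covers within [addr, end), via nr_pmd_pages = (next - addr) >> PAGE_SHIFT, instead of always charging a full PMD's worth. A small sketch of that computation (pmd_addr_end() is re-implemented here for illustration):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SIZE   (1UL << 21)
#define PMD_MASK   (~(PMD_SIZE - 1))

/* Equivalent of pmd_addr_end(): end of the current PMD, clamped to @end. */
static uint64_t pmd_addr_end(uint64_t addr, uint64_t end)
{
	uint64_t boundary = (addr & PMD_MASK) + PMD_SIZE;

	return boundary < end ? boundary : end;
}

int main(void)
{
	uint64_t addr = 0x200000, end = 0x300000;      /* half a 2 MiB PMD */
	uint64_t next = pmd_addr_end(addr, end);
	uint64_t nr_pmd_pages = (next - addr) >> PAGE_SHIFT;

	/* 0x100000 / 4 KiB = 256 pages, not the 512 of a full PMD. */
	printf("%llu\n", (unsigned long long)nr_pmd_pages);
	return 0;
}
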
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index aa7d279321ea..331e101bf801 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
+#include <linux/ioremap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
@@ -439,10 +440,10 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
EXPORT_SYMBOL(ioremap_cache);
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
- unsigned long prot_val)
+ pgprot_t prot)
{
return __ioremap_caller(phys_addr, size,
- pgprot2cachemode(__pgprot(prot_val)),
+ pgprot2cachemode(prot),
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
@@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
- if ((void __force *)addr <= high_memory)
+ if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr)))
return;
/*
@@ -502,6 +503,14 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags)
+{
+ if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return (void __force *)ioremap_cache(phys_addr, size);
+
+ return (void __force *)ioremap_encrypted(phys_addr, size);
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
@@ -592,8 +601,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
* Examine the physical address to determine if it is EFI data. Check
* it against the boot params structure and EFI tables and memory types.
*/
-static bool memremap_is_efi_data(resource_size_t phys_addr,
- unsigned long size)
+static bool memremap_is_efi_data(resource_size_t phys_addr)
{
u64 paddr;
@@ -631,41 +639,54 @@ static bool memremap_is_efi_data(resource_size_t phys_addr,
* Examine the physical address to determine if it is boot data by checking
* it against the boot params setup_data chain.
*/
-static bool memremap_is_setup_data(resource_size_t phys_addr,
- unsigned long size)
+static bool __ref __memremap_is_setup_data(resource_size_t phys_addr, bool early)
{
+ unsigned int setup_data_sz = sizeof(struct setup_data);
struct setup_indirect *indirect;
struct setup_data *data;
u64 paddr, paddr_next;
paddr = boot_params.hdr.setup_data;
while (paddr) {
- unsigned int len;
+ unsigned int len, size;
if (phys_addr == paddr)
return true;
- data = memremap(paddr, sizeof(*data),
- MEMREMAP_WB | MEMREMAP_DEC);
+ if (early)
+ data = early_memremap_decrypted(paddr, setup_data_sz);
+ else
+ data = memremap(paddr, setup_data_sz, MEMREMAP_WB | MEMREMAP_DEC);
if (!data) {
- pr_warn("failed to memremap setup_data entry\n");
+ pr_warn("failed to remap setup_data entry\n");
return false;
}
+ size = setup_data_sz;
+
paddr_next = data->next;
len = data->len;
- if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
- memunmap(data);
+ if ((phys_addr > paddr) &&
+ (phys_addr < (paddr + setup_data_sz + len))) {
+ if (early)
+ early_memunmap(data, setup_data_sz);
+ else
+ memunmap(data);
return true;
}
if (data->type == SETUP_INDIRECT) {
- memunmap(data);
- data = memremap(paddr, sizeof(*data) + len,
- MEMREMAP_WB | MEMREMAP_DEC);
+ size += len;
+ if (early) {
+ early_memunmap(data, setup_data_sz);
+ data = early_memremap_decrypted(paddr, size);
+ } else {
+ memunmap(data);
+ data = memremap(paddr, size, MEMREMAP_WB | MEMREMAP_DEC);
+ }
if (!data) {
- pr_warn("failed to memremap indirect setup_data\n");
+ pr_warn("failed to remap indirect setup_data\n");
return false;
}
@@ -677,7 +698,10 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
}
}
- memunmap(data);
+ if (early)
+ early_memunmap(data, size);
+ else
+ memunmap(data);
if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
return true;
@@ -688,66 +712,14 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
return false;
}
-/*
- * Examine the physical address to determine if it is boot data by checking
- * it against the boot params setup_data chain (early boot version).
- */
-static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
- unsigned long size)
+static bool memremap_is_setup_data(resource_size_t phys_addr)
{
- struct setup_indirect *indirect;
- struct setup_data *data;
- u64 paddr, paddr_next;
-
- paddr = boot_params.hdr.setup_data;
- while (paddr) {
- unsigned int len, size;
-
- if (phys_addr == paddr)
- return true;
-
- data = early_memremap_decrypted(paddr, sizeof(*data));
- if (!data) {
- pr_warn("failed to early memremap setup_data entry\n");
- return false;
- }
-
- size = sizeof(*data);
-
- paddr_next = data->next;
- len = data->len;
-
- if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
- early_memunmap(data, sizeof(*data));
- return true;
- }
-
- if (data->type == SETUP_INDIRECT) {
- size += len;
- early_memunmap(data, sizeof(*data));
- data = early_memremap_decrypted(paddr, size);
- if (!data) {
- pr_warn("failed to early memremap indirect setup_data\n");
- return false;
- }
-
- indirect = (struct setup_indirect *)data->data;
-
- if (indirect->type != SETUP_INDIRECT) {
- paddr = indirect->addr;
- len = indirect->len;
- }
- }
-
- early_memunmap(data, size);
-
- if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
- return true;
-
- paddr = paddr_next;
- }
+ return __memremap_is_setup_data(phys_addr, false);
+}
- return false;
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr)
+{
+ return __memremap_is_setup_data(phys_addr, true);
}
/*
@@ -768,8 +740,8 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
return false;
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
- if (memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size))
+ if (memremap_is_setup_data(phys_addr) ||
+ memremap_is_efi_data(phys_addr))
return false;
}
@@ -794,8 +766,8 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
encrypted_prot = true;
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
- if (early_memremap_is_setup_data(phys_addr, size) ||
- memremap_is_efi_data(phys_addr, size))
+ if (early_memremap_is_setup_data(phys_addr) ||
+ memremap_is_efi_data(phys_addr))
encrypted_prot = false;
}
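
The two setup_data walkers are folded into __memremap_is_setup_data(), with an 'early' flag choosing between early_memremap_decrypted() and memremap() for each hop of the chain. A simplified user-space sketch of that single-walker shape; the SETUP_INDIRECT handling is omitted and the mapping calls are reduced to stubs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct setup_data: a chain linked by "physical" address. */
struct setup_data {
	uint64_t next;   /* address of the next entry, 0 terminates the chain */
	uint32_t len;    /* payload length following the header               */
};

/* Stub mappers standing in for memremap() vs. early_memremap_decrypted(). */
static struct setup_data *map_entry(uint64_t paddr, bool early)
{
	(void)early;                       /* both paths map the same bytes here */
	return (struct setup_data *)(uintptr_t)paddr;
}
static void unmap_entry(struct setup_data *d, bool early) { (void)d; (void)early; }

/*
 * One walker for both callers, as in __memremap_is_setup_data(): the @early
 * flag only selects which mapping primitive is used for each hop.
 */
static bool is_setup_data(uint64_t chain, uint64_t phys_addr, bool early)
{
	uint64_t paddr = chain;

	while (paddr) {
		struct setup_data *data = map_entry(paddr, early);
		uint64_t next = data->next;
		uint32_t len = data->len;

		unmap_entry(data, early);
		if (phys_addr >= paddr && phys_addr < paddr + sizeof(*data) + len)
			return true;
		paddr = next;
	}
	return false;
}

int main(void)
{
	struct setup_data d2 = { .next = 0, .len = 16 };
	struct setup_data d1 = { .next = (uint64_t)(uintptr_t)&d2, .len = 8 };

	/* An address inside the second entry is found by walking from the first. */
	printf("%d\n", is_setup_data((uint64_t)(uintptr_t)&d1,
				     (uint64_t)(uintptr_t)&d2 + 4, false));
	return 0;
}
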
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9dddf19a5571..0539efd0d216 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
/* cpu_feature_enabled() cannot be used this early */
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..3c306de52fd4 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -22,7 +22,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
@@ -47,13 +47,28 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &direct_map_physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/*
+ * The end of the physical address space that can be mapped directly by the
+ * kernel. This starts out at ((1 << MAX_PHYSMEM_BITS) - 1), but KASLR may reduce
+ * that in order to increase the available entropy for mapping other regions.
+ */
+unsigned long direct_map_physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +97,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -96,8 +113,14 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt physical memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
+ /*
+ * Adapt physical memory region size based on available memory,
+ * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the
+ * device BAR space assuming the direct map space is large enough
+ * for creating a ZONE_DEVICE mapping in the direct map corresponding
+ * to the physical BAR address.
+ */
+ if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb))
kaslr_regions[0].size_tb = memory_tb;
/*
@@ -128,11 +151,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
+ /* Calculate the end of the region */
+ vaddr += get_padding(&kaslr_regions[i]);
/*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the direct_map_physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
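
Each KASLR region now optionally records where it ends, and the direct-map region writes that end into direct_map_physmem_end (the kernel converts it to a physical address with __pa_nodebug()). A simplified sketch of the base/size/end bookkeeping, using made-up sizes and a fixed stand-in for the random entropy:

#include <stdint.h>
#include <stdio.h>

#define TB_SHIFT  40
#define PUD_SIZE  (1ULL << 30)

/* Round @x up to the next multiple of @a (a power of two). */
static uint64_t round_up_pow2(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

struct region {
	uint64_t base;     /* randomized start of the region       */
	uint64_t *end;     /* optional: where to record region end */
	uint64_t size_tb;  /* padded size in TiB                    */
};

static uint64_t direct_map_physmem_end;

int main(void)
{
	struct region regions[] = {
		{ .end = &direct_map_physmem_end, .size_tb = 10 }, /* direct map */
		{ .size_tb = 32 },                                  /* vmalloc    */
		{ .size_tb = 1 },                                   /* vmemmap    */
	};
	uint64_t vaddr = 0xffff888000000000ULL;   /* example starting point        */
	uint64_t entropy = 3 * PUD_SIZE;          /* stand-in for the random part  */

	for (unsigned i = 0; i < 3; i++) {
		vaddr += entropy;
		regions[i].base = vaddr;

		/* Walk to the end of the region, then record it if requested. */
		vaddr += regions[i].size_tb << TB_SHIFT;
		if (regions[i].end)
			*regions[i].end = vaddr - 1;

		/* Minimum padding based on the randomization alignment. */
		vaddr = round_up_pow2(vaddr + 1, PUD_SIZE);
	}
	printf("direct map ends at %#llx\n",
	       (unsigned long long)direct_map_physmem_end);
	return 0;
}
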
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 6f3b3e028718..95bae74fdab2 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -94,6 +94,8 @@ void __init mem_encrypt_init(void)
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
+ snp_secure_tsc_prepare();
+
print_mem_encrypt_feature_info();
}
@@ -102,6 +104,13 @@ void __init mem_encrypt_setup_arch(void)
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
+ /*
+ * Do RMP table fixups after the e820 tables have been setup by
+ * e820__memory_setup().
+ */
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ snp_fixup_e820_tables();
+
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 422602f6039b..7490ff6d83b1 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -2,13 +2,11 @@
/*
* AMD Memory Encryption Support
*
- * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ * Copyright (C) 2016-2024 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -283,7 +281,7 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
#endif
}
-static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
{
/*
* To maintain the security guarantees of SEV-SNP guests, make sure
@@ -292,11 +290,11 @@ static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool
if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
snp_set_memory_shared(vaddr, npages);
- return true;
+ return 0;
}
/* Return true unconditionally: return value doesn't matter for the SEV side */
-static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
/*
* After memory is mapped encrypted in the page table, validate it
@@ -308,62 +306,85 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
- return true;
+ return 0;
}
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+int prepare_pte_enc(struct pte_enc_desc *d)
{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
+ pgprot_t old_prot;
- pfn = pg_level_to_pfn(level, kpte, &old_prot);
- if (!pfn)
- return;
+ d->pfn = pg_level_to_pfn(d->pte_level, d->kpte, &old_prot);
+ if (!d->pfn)
+ return 1;
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
+ d->new_pgprot = old_prot;
+ if (d->encrypt)
+ pgprot_val(d->new_pgprot) |= _PAGE_ENC;
else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
+ pgprot_val(d->new_pgprot) &= ~_PAGE_ENC;
/* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
+ if (pgprot_val(old_prot) == pgprot_val(d->new_pgprot))
+ return 1;
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
+ d->pa = d->pfn << PAGE_SHIFT;
+ d->size = page_level_size(d->pte_level);
/*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
+ * In-place en-/decryption and physical page attribute change
+ * from C=1 to C=0 or vice versa will be performed. Flush the
+ * caches to ensure that data gets accessed with the correct
+ * C-bit.
*/
- clflush_cache_range(__va(pa), size);
+ if (d->va)
+ clflush_cache_range(d->va, d->size);
+ else
+ clflush_cache_range(__va(d->pa), d->size);
+
+ return 0;
+}
+
+void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot)
+{
+ pte_t new_pte;
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ struct pte_enc_desc d = {
+ .kpte = kpte,
+ .pte_level = level,
+ .encrypt = enc
+ };
+
+ if (prepare_pte_enc(&d))
+ return;
/* Encrypt/decrypt the contents in-place */
if (enc) {
- sme_early_encrypt(pa, size);
+ sme_early_encrypt(d.pa, d.size);
} else {
- sme_early_decrypt(pa, size);
+ sme_early_decrypt(d.pa, d.size);
/*
* ON SNP, the page state in the RMP table must happen
* before the page table updates.
*/
- early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1);
+ early_snp_set_memory_shared((unsigned long)__va(d.pa), d.pa, 1);
}
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
+ set_pte_enc_mask(kpte, d.pfn, d.new_pgprot);
/*
* If page is set encrypted in the page table, then update the RMP table to
* add this page as private.
*/
if (enc)
- early_snp_set_memory_private((unsigned long)__va(pa), pa, 1);
+ early_snp_set_memory_private((unsigned long)__va(d.pa), d.pa, 1);
}
static int __init early_set_memory_enc_dec(unsigned long vaddr,
@@ -467,6 +488,8 @@ void __init sme_early_init(void)
x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish;
x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required;
x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required;
+ x86_platform.guest.enc_kexec_begin = snp_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = snp_kexec_finish;
/*
* AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the
@@ -510,6 +533,15 @@ void __init sme_early_init(void)
*/
x86_init.resources.dmi_setup = snp_dmi_setup;
}
+
+ /*
+ * Switch the SVSM CA mapping (if active) from identity mapped to
+ * kernel mapped.
+ */
+ snp_update_svsm_ca();
+
+ if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
void __init mem_encrypt_free_decrypted_mem(void)
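
__set_clr_pte_enc() is split above into prepare_pte_enc(), which fills a descriptor and flushes caches, and set_pte_enc_mask(), which rewrites the PTE, so other paths (e.g. the kexec handlers) can reuse the two halves. A toy sketch of that prepare/commit split; the kernel's helpers take the pfn and pgprot separately, and the descriptor below is only a stand-in for struct pte_enc_desc:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_ENC (1ULL << 51)   /* illustrative C-bit position */

/* Toy stand-in for struct pte_enc_desc. */
struct enc_desc {
	uint64_t *pte;       /* page-table entry to rewrite     */
	bool encrypt;        /* set or clear the encryption bit */
	uint64_t new_prot;   /* filled in by the prepare step   */
};

/* Prepare: compute the new protections; non-zero means "nothing to do". */
static int prepare_pte_enc(struct enc_desc *d)
{
	uint64_t old_prot = *d->pte;

	d->new_prot = d->encrypt ? (old_prot | PAGE_ENC)
				 : (old_prot & ~PAGE_ENC);
	if (d->new_prot == old_prot)
		return 1;
	/* ...the kernel also flushes caches for the affected range here... */
	return 0;
}

/* Commit: actually switch the encryption mask in the PTE. */
static void set_pte_enc_mask(struct enc_desc *d)
{
	*d->pte = d->new_prot;
}

int main(void)
{
	uint64_t pte = 0x1000 | 0x63;                 /* some present mapping */
	struct enc_desc d = { .pte = &pte, .encrypt = true };

	if (!prepare_pte_enc(&d))
		set_pte_enc_mask(&d);
	printf("pte=%#llx\n", (unsigned long long)pte);
	return 0;
}
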
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index e25288ee33c2..f8a33b25ae86 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -72,6 +72,7 @@ SYM_FUNC_START(sme_encrypt_execute)
SYM_FUNC_END(sme_encrypt_execute)
SYM_FUNC_START(__enc_copy)
+ ANNOTATE_NOENDBR
/*
* Routine used to encrypt memory in place.
* This routine must be run outside of the kernel proper since
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index ac33b2263a43..5eecdd92da10 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -7,8 +7,6 @@
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
/*
* Since we're dealing with identity mappings, physical and virtual
* addresses are the same, so override these defines which are ultimately
@@ -495,10 +493,10 @@ void __head sme_enable(struct boot_params *bp)
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
unsigned long me_mask;
- bool snp;
+ bool snp_en;
u64 msr;
- snp = snp_init(bp);
+ snp_en = snp_init(bp);
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
@@ -531,8 +529,11 @@ void __head sme_enable(struct boot_params *bp)
RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
- /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
- if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
+ /*
+ * Any discrepancies between the presence of a CC blob and SNP
+ * enablement abort the guest.
+ */
+ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED))
snp_abort();
/* Check if memory encryption is enabled */
@@ -562,7 +563,7 @@ void __head sme_enable(struct boot_params *bp)
}
RIP_REL_REF(sme_me_mask) = me_mask;
- physical_mask &= ~me_mask;
- cc_vendor = CC_VENDOR_AMD;
+ RIP_REL_REF(physical_mask) &= ~me_mask;
+ RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD;
cc_set_mask(me_mask);
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..5ed2109211da 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
{
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
- unsigned long gap_min, gap_max;
/* Values close to RLIM_INFINITY can overflow. */
if (gap + pad > gap)
@@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
* Top of mmap area (just below the process stack).
* Leave an at least ~128 MB hole with possible stack randomization.
*/
- gap_min = SIZE_128M;
- gap_max = (task_size / 6) * 5;
-
- if (gap < gap_min)
- gap = gap_min;
- else if (gap > gap_max)
- gap = gap_max;
+ gap = clamp(gap, SIZE_128M, (task_size / 6) * 5);
return PAGE_ALIGN(task_size - gap - rnd);
}
@@ -129,9 +122,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
else
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
@@ -163,11 +156,6 @@ unsigned long get_mmap_base(int is_legacy)
return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
- return NULL;
-}
-
/**
* mmap_address_hint_valid - Validate the address hint of mmap
* @addr: Address hint
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 65e9a6e391c0..64e5cdb2460a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
+#include <linux/numa_memblks.h>
#include <asm/e820/api.h>
#include <asm/proto.h>
@@ -22,16 +23,6 @@
#include "numa_internal.h"
int numa_off;
-nodemask_t numa_nodes_parsed __initdata;
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
-static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
-
-static int numa_distance_cnt;
-static u8 *numa_distance;
static __init int numa_setup(char *opt)
{
@@ -124,456 +115,27 @@ void __init setup_node_to_cpumask_map(void)
pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
-static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
- struct numa_meminfo *mi)
-{
- /* ignore zero length blks */
- if (start == end)
- return 0;
-
- /* whine about and ignore invalid blks */
- if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
- return 0;
- }
-
- if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("too many memblk ranges\n");
- return -EINVAL;
- }
-
- mi->blk[mi->nr_blks].start = start;
- mi->blk[mi->nr_blks].end = end;
- mi->blk[mi->nr_blks].nid = nid;
- mi->nr_blks++;
- return 0;
-}
-
-/**
- * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
- * @idx: Index of memblk to remove
- * @mi: numa_meminfo to remove memblk from
- *
- * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
- * decrementing @mi->nr_blks.
- */
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
-{
- mi->nr_blks--;
- memmove(&mi->blk[idx], &mi->blk[idx + 1],
- (mi->nr_blks - idx) * sizeof(mi->blk[0]));
-}
-
-/**
- * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
- * @dst: numa_meminfo to append block to
- * @idx: Index of memblk to remove
- * @src: numa_meminfo to remove memblk from
- */
-static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
- struct numa_meminfo *src)
-{
- dst->blk[dst->nr_blks++] = src->blk[idx];
- numa_remove_memblk_from(idx, src);
-}
-
-/**
- * numa_add_memblk - Add one numa_memblk to numa_meminfo
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end: End address of the new memblk
- *
- * Add a new memblk to the default numa_meminfo.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
- return numa_add_memblk_to(nid, start, end, &numa_meminfo);
-}
-
-/* Allocate NODE_DATA for a node on the local memory */
-static void __init alloc_node_data(int nid)
+static int __init numa_register_nodes(void)
{
- const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- u64 nd_pa;
- void *nd;
- int tnid;
-
- /*
- * Allocate node data. Try node-local memory and then any node.
- * Never allocate in DMA zone.
- */
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
-
- /* report and initialize */
- printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
-
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
-}
-
-/**
- * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * @mi: numa_meminfo to clean up
- *
- * Sanitize @mi by merging and removing unnecessary memblks. Also check for
- * conflicts and clear unused memblks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
-{
- const u64 low = 0;
- const u64 high = PFN_PHYS(max_pfn);
- int i, j, k;
-
- /* first, trim all entries */
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- /* move / save reserved memory ranges */
- if (!memblock_overlaps_region(&memblock.memory,
- bi->start, bi->end - bi->start)) {
- numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
- continue;
- }
-
- /* make sure all non-reserved blocks are inside the limits */
- bi->start = max(bi->start, low);
-
- /* preserve info for non-RAM areas above 'max_pfn': */
- if (bi->end > high) {
- numa_add_memblk_to(bi->nid, high, bi->end,
- &numa_reserved_meminfo);
- bi->end = high;
- }
-
- /* and there's no empty block */
- if (bi->start >= bi->end)
- numa_remove_memblk_from(i--, mi);
- }
-
- /* merge neighboring / overlapping entries */
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- for (j = i + 1; j < mi->nr_blks; j++) {
- struct numa_memblk *bj = &mi->blk[j];
- u64 start, end;
-
- /*
- * See whether there are overlapping blocks. Whine
- * about but allow overlaps of the same nid. They
- * will be merged below.
- */
- if (bi->end > bj->start && bi->start < bj->end) {
- if (bi->nid != bj->nid) {
- pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->nid, bj->start, bj->end - 1);
- return -EINVAL;
- }
- pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->start, bj->end - 1);
- }
-
- /*
- * Join together blocks on the same node, holes
- * between which don't overlap with memory on other
- * nodes.
- */
- if (bi->nid != bj->nid)
- continue;
- start = min(bi->start, bj->start);
- end = max(bi->end, bj->end);
- for (k = 0; k < mi->nr_blks; k++) {
- struct numa_memblk *bk = &mi->blk[k];
-
- if (bi->nid == bk->nid)
- continue;
- if (start < bk->end && end > bk->start)
- break;
- }
- if (k < mi->nr_blks)
- continue;
- printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1, bj->start,
- bj->end - 1, start, end - 1);
- bi->start = start;
- bi->end = end;
- numa_remove_memblk_from(j--, mi);
- }
- }
-
- /* clear unused ones */
- for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
- mi->blk[i].start = mi->blk[i].end = 0;
- mi->blk[i].nid = NUMA_NO_NODE;
- }
-
- return 0;
-}
-
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
- const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
- if (mi->blk[i].start != mi->blk[i].end &&
- mi->blk[i].nid != NUMA_NO_NODE)
- node_set(mi->blk[i].nid, *nodemask);
-}
-
-/**
- * numa_reset_distance - Reset NUMA distance table
- *
- * The current table is freed. The next numa_set_distance() call will
- * create a new one.
- */
-void __init numa_reset_distance(void)
-{
- size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
-
- /* numa_distance could be 1LU marking allocation failure, test cnt */
- if (numa_distance_cnt)
- memblock_free(numa_distance, size);
- numa_distance_cnt = 0;
- numa_distance = NULL; /* enable table creation */
-}
-
-static int __init numa_alloc_distance(void)
-{
- nodemask_t nodes_parsed;
- size_t size;
- int i, j, cnt = 0;
- u64 phys;
-
- /* size the new table and allocate it */
- nodes_parsed = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
- for_each_node_mask(i, nodes_parsed)
- cnt = i;
- cnt++;
- size = cnt * cnt * sizeof(numa_distance[0]);
-
- phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
- PFN_PHYS(max_pfn_mapped));
- if (!phys) {
- pr_warn("Warning: can't allocate distance table!\n");
- /* don't retry until explicitly reset */
- numa_distance = (void *)1LU;
- return -ENOMEM;
- }
-
- numa_distance = __va(phys);
- numa_distance_cnt = cnt;
-
- /* fill with the default distances */
- for (i = 0; i < cnt; i++)
- for (j = 0; j < cnt; j++)
- numa_distance[i * cnt + j] = i == j ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-
- return 0;
-}
-
-/**
- * numa_set_distance - Set NUMA distance from one NUMA to another
- * @from: the 'from' node to set distance
- * @to: the 'to' node to set distance
- * @distance: NUMA distance
- *
- * Set the distance from node @from to @to to @distance. If distance table
- * doesn't exist, one which is large enough to accommodate all the currently
- * known nodes will be created.
- *
- * If such table cannot be allocated, a warning is printed and further
- * calls are ignored until the distance table is reset with
- * numa_reset_distance().
- *
- * If @from or @to is higher than the highest known node or lower than zero
- * at the time of table creation or @distance doesn't make sense, the call
- * is ignored.
- * This is to allow simplification of specific NUMA config implementations.
- */
-void __init numa_set_distance(int from, int to, int distance)
-{
- if (!numa_distance && numa_alloc_distance() < 0)
- return;
-
- if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
- from < 0 || to < 0) {
- pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- if ((u8)distance != distance ||
- (from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- numa_distance[from * numa_distance_cnt + to] = distance;
-}
-
-int __node_distance(int from, int to)
-{
- if (from >= numa_distance_cnt || to >= numa_distance_cnt)
- return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
- return numa_distance[from * numa_distance_cnt + to];
-}
-EXPORT_SYMBOL(__node_distance);
-
-/*
- * Mark all currently memblock-reserved physical memory (which covers the
- * kernel's own memory ranges) as hot-unswappable.
- */
-static void __init numa_clear_kernel_node_hotplug(void)
-{
- nodemask_t reserved_nodemask = NODE_MASK_NONE;
- struct memblock_region *mb_region;
- int i;
-
- /*
- * We have to do some preprocessing of memblock regions, to
- * make them suitable for reservation.
- *
- * At this time, all memory regions reserved by memblock are
- * used by the kernel, but those regions are not split up
- * along node boundaries yet, and don't necessarily have their
- * node ID set yet either.
- *
- * So iterate over all memory known to the x86 architecture,
- * and use those ranges to set the nid in memblock.reserved.
- * This will split up the memblock regions along node
- * boundaries and will set the node IDs as well.
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = numa_meminfo.blk + i;
- int ret;
-
- ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
- WARN_ON_ONCE(ret);
- }
-
- /*
- * Now go over all reserved memblock regions, to construct a
- * node mask of all kernel reserved memory areas.
- *
- * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
- * numa_meminfo might not include all memblock.reserved
- * memory ranges, because quirks such as trim_snb_memory()
- * reserve specific pages for Sandy Bridge graphics. ]
- */
- for_each_reserved_mem_region(mb_region) {
- int nid = memblock_get_region_node(mb_region);
-
- if (nid != MAX_NUMNODES)
- node_set(nid, reserved_nodemask);
- }
-
- /*
- * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
- * belonging to the reserved node mask.
- *
- * Note that this will include memory regions that reside
- * on nodes that contain kernel memory - entire nodes
- * become hot-unpluggable:
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = numa_meminfo.blk + i;
-
- if (!node_isset(mb->nid, reserved_nodemask))
- continue;
-
- memblock_clear_hotplug(mb->start, mb->end - mb->start);
- }
-}
-
-static int __init numa_register_memblks(struct numa_meminfo *mi)
-{
- int i, nid;
-
- /* Account for nodes with cpus and no memory */
- node_possible_map = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&node_possible_map, mi);
- if (WARN_ON(nodes_empty(node_possible_map)))
- return -EINVAL;
-
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *mb = &mi->blk[i];
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.memory, mb->nid);
- }
-
- /*
- * At very early time, the kernel have to use some memory such as
- * loading the kernel image. We cannot prevent this anyway. So any
- * node the kernel resides in should be un-hotpluggable.
- *
- * And when we come here, alloc node data won't fail.
- */
- numa_clear_kernel_node_hotplug();
-
- /*
- * If sections array is gonna be used for pfn -> nid mapping, check
- * whether its granularity is fine enough.
- */
- if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
- unsigned long pfn_align = node_map_pfn_alignment();
-
- if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
- PFN_PHYS(pfn_align) >> 20,
- PFN_PHYS(PAGES_PER_SECTION) >> 20);
- return -EINVAL;
- }
- }
+ int nid;
if (!memblock_validate_numa_coverage(SZ_1M))
return -EINVAL;
/* Finally register nodes. */
for_each_node_mask(nid, node_possible_map) {
- u64 start = PFN_PHYS(max_pfn);
- u64 end = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- if (nid != mi->blk[i].nid)
- continue;
- start = min(mi->blk[i].start, start);
- end = max(mi->blk[i].end, end);
- }
+ unsigned long start_pfn, end_pfn;
- if (start >= end)
+ /*
+ * Note, get_pfn_range_for_nid() depends on
+ * memblock_set_node() having already happened
+ */
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (start_pfn >= end_pfn)
continue;
alloc_node_data(nid);
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -609,39 +171,11 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
- nodes_clear(numa_nodes_parsed);
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
- MAX_NUMNODES));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
- MAX_NUMNODES));
- /* In case that parsing SRAT failed. */
- WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
- numa_reset_distance();
-
- ret = init_func();
- if (ret < 0)
- return ret;
-
- /*
- * We reset memblock back to the top-down direction
- * here because if we configured ACPI_NUMA, we have
- * parsed SRAT in init_func(). It is ok to have the
- * reset here even if we did't configure ACPI_NUMA
- * or acpi numa init fails and fallbacks to dummy
- * numa init.
- */
- memblock_set_bottom_up(false);
-
- ret = numa_cleanup_meminfo(&numa_meminfo);
+ ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
if (ret < 0)
return ret;
- numa_emulation(&numa_meminfo, numa_distance_cnt);
-
- ret = numa_register_memblks(&numa_meminfo);
+ ret = numa_register_nodes();
if (ret < 0)
return ret;
@@ -782,12 +316,12 @@ void __init init_cpu_to_node(void)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
# ifndef CONFIG_NUMA_EMU
-void numa_add_cpu(int cpu)
+void numa_add_cpu(unsigned int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
-void numa_remove_cpu(int cpu)
+void numa_remove_cpu(unsigned int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
@@ -825,7 +359,7 @@ int early_cpu_to_node(int cpu)
return per_cpu(x86_cpu_to_node_map, cpu);
}
-void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
struct cpumask *mask;
@@ -857,12 +391,12 @@ static void numa_set_cpumask(int cpu, bool enable)
debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
-void numa_add_cpu(int cpu)
+void numa_add_cpu(unsigned int cpu)
{
numa_set_cpumask(cpu, true);
}
-void numa_remove_cpu(int cpu)
+void numa_remove_cpu(unsigned int cpu)
{
numa_set_cpumask(cpu, false);
}
@@ -893,113 +427,29 @@ EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#ifdef CONFIG_NUMA_KEEP_MEMINFO
-static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
+ unsigned int nr_emu_nids)
{
- int i;
-
- for (i = 0; i < mi->nr_blks; i++)
- if (mi->blk[i].start <= start && mi->blk[i].end > start)
- return mi->blk[i].nid;
- return NUMA_NO_NODE;
-}
-
-int phys_to_target_node(phys_addr_t start)
-{
- int nid = meminfo_to_nid(&numa_meminfo, start);
+ int i, j;
/*
- * Prefer online nodes, but if reserved memory might be
- * hot-added continue the search with reserved ranges.
+ * Transform __apicid_to_node table to use emulated nids by
+ * reverse-mapping phys_nid. The maps should always exist but fall
+ * back to zero just in case.
*/
- if (nid != NUMA_NO_NODE)
- return nid;
-
- return meminfo_to_nid(&numa_reserved_meminfo, start);
-}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
-
-int memory_add_physaddr_to_nid(u64 start)
-{
- int nid = meminfo_to_nid(&numa_meminfo, start);
-
- if (nid == NUMA_NO_NODE)
- nid = numa_meminfo.blk[0].nid;
- return nid;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
-static int __init cmp_memblk(const void *a, const void *b)
-{
- const struct numa_memblk *ma = *(const struct numa_memblk **)a;
- const struct numa_memblk *mb = *(const struct numa_memblk **)b;
-
- return (ma->start > mb->start) - (ma->start < mb->start);
+ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+ if (__apicid_to_node[i] == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < nr_emu_nids; j++)
+ if (__apicid_to_node[i] == emu_nid_to_phys[j])
+ break;
+ __apicid_to_node[i] = j < nr_emu_nids ? j : 0;
+ }
}
-static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
-
-/**
- * numa_fill_memblks - Fill gaps in numa_meminfo memblks
- * @start: address to begin fill
- * @end: address to end fill
- *
- * Find and extend numa_meminfo memblks to cover the physical
- * address range @start-@end
- *
- * RETURNS:
- * 0 : Success
- * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
- */
-
-int __init numa_fill_memblks(u64 start, u64 end)
+u64 __init numa_emu_dma_end(void)
{
- struct numa_memblk **blk = &numa_memblk_list[0];
- struct numa_meminfo *mi = &numa_meminfo;
- int count = 0;
- u64 prev_end;
-
- /*
- * Create a list of pointers to numa_meminfo memblks that
- * overlap start, end. The list is used to make in-place
- * changes that fill out the numa_meminfo memblks.
- */
- for (int i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- if (memblock_addrs_overlap(start, end - start, bi->start,
- bi->end - bi->start)) {
- blk[count] = &mi->blk[i];
- count++;
- }
- }
- if (!count)
- return NUMA_NO_MEMBLK;
-
- /* Sort the list of pointers in memblk->start order */
- sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
-
- /* Make sure the first/last memblks include start/end */
- blk[0]->start = min(blk[0]->start, start);
- blk[count - 1]->end = max(blk[count - 1]->end, end);
-
- /*
- * Fill any gaps by tracking the previous memblks
- * end address and backfilling to it if needed.
- */
- prev_end = blk[0]->end;
- for (int i = 1; i < count; i++) {
- struct numa_memblk *curr = blk[i];
-
- if (prev_end >= curr->start) {
- if (prev_end < curr->end)
- prev_end = curr->end;
- } else {
- curr->start = prev_end;
- prev_end = curr->end;
- }
- }
- return 0;
+ return PFN_PHYS(MAX_DMA32_PFN);
}
-
-#endif
+#endif /* CONFIG_NUMA_EMU */
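The reverse mapping that the new numa_emu_update_cpu_to_node() performs above is easy to model in isolation. A minimal user-space sketch, with illustrative table sizes and an assumed layout of four fake nodes interleaved over two physical nodes (none of these values come from the patch):

#include <stdio.h>

#define NUMA_NO_NODE	(-1)
#define NR_APICIDS	8

int main(void)
{
	/* apicid -> physical nid, as firmware-table parsing would have left it */
	int apicid_to_node[NR_APICIDS] = { 0, 0, 1, 1, NUMA_NO_NODE, 1, 0, 0 };
	/* four emulated nodes interleaved over two physical nodes */
	const int emu_nid_to_phys[] = { 0, 1, 0, 1 };
	const int nr_emu_nids = 4;

	for (int i = 0; i < NR_APICIDS; i++) {
		int j;

		if (apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		/* reverse-map: first emulated nid that lives on this physical nid */
		for (j = 0; j < nr_emu_nids; j++)
			if (apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}

	for (int i = 0; i < NR_APICIDS; i++)
		printf("apicid %d -> emulated nid %d\n", i, apicid_to_node[i]);
	return 0;
}

The fallback to node 0 mirrors the "fall back to zero just in case" comment in the hunk.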
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 025fd7ea5d69..65fda406e6f2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
+#include <linux/vmalloc.h>
#include <asm/pgtable_areas.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
deleted file mode 100644
index 9a9305367fdd..000000000000
--- a/arch/x86/mm/numa_emulation.c
+++ /dev/null
@@ -1,585 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA emulation
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/topology.h>
-#include <linux/memblock.h>
-#include <asm/dma.h>
-
-#include "numa_internal.h"
-
-static int emu_nid_to_phys[MAX_NUMNODES];
-static char *emu_cmdline __initdata;
-
-int __init numa_emu_cmdline(char *str)
-{
- emu_cmdline = str;
- return 0;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < mi->nr_blks; i++)
- if (mi->blk[i].nid == nid)
- return i;
- return -ENOENT;
-}
-
-static u64 __init mem_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = PFN_DOWN(end);
-
- if (start_pfn < end_pfn)
- return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
- return 0;
-}
-
-/*
- * Sets up nid to range from @start to @end. The return value is -errno if
- * something went wrong, 0 otherwise.
- */
-static int __init emu_setup_memblk(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- int nid, int phys_blk, u64 size)
-{
- struct numa_memblk *eb = &ei->blk[ei->nr_blks];
- struct numa_memblk *pb = &pi->blk[phys_blk];
-
- if (ei->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: Too many emulated memblks, failing emulation\n");
- return -EINVAL;
- }
-
- ei->nr_blks++;
- eb->start = pb->start;
- eb->end = pb->start + size;
- eb->nid = nid;
-
- if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = pb->nid;
-
- pb->start += size;
- if (pb->start >= pb->end) {
- WARN_ON_ONCE(pb->start > pb->end);
- numa_remove_memblk_from(phys_blk, pi);
- }
-
- printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
- return 0;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_interleave(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, int nr_nodes)
-{
- nodemask_t physnode_mask = numa_nodes_parsed;
- u64 size;
- int big;
- int nid = 0;
- int i, ret;
-
- if (nr_nodes <= 0)
- return -1;
- if (nr_nodes > MAX_NUMNODES) {
- pr_info("numa=fake=%d too large, reducing to %d\n",
- nr_nodes, MAX_NUMNODES);
- nr_nodes = MAX_NUMNODES;
- }
-
- /*
- * Calculate target node size. x86_32 freaks on __udivdi3() so do
- * the division in ulong number of pages and convert back.
- */
- size = max_addr - addr - mem_hole_size(addr, max_addr);
- size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
-
- /*
- * Calculate the number of big nodes that can be allocated as a result
- * of consolidating the remainder.
- */
- big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
- FAKE_NODE_MIN_SIZE;
-
- size &= FAKE_NODE_MIN_HASH_MASK;
- if (!size) {
- pr_err("Not enough memory for each node. "
- "NUMA emulation disabled.\n");
- return -1;
- }
-
- /*
- * Continue to fill physical nodes with fake nodes until there is no
- * memory left on any of them.
- */
- while (!nodes_empty(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
- u64 start, limit, end;
- int phys_blk;
-
- phys_blk = emu_find_memblk_by_nid(i, pi);
- if (phys_blk < 0) {
- node_clear(i, physnode_mask);
- continue;
- }
- start = pi->blk[phys_blk].start;
- limit = pi->blk[phys_blk].end;
- end = start + size;
-
- if (nid < big)
- end += FAKE_NODE_MIN_SIZE;
-
- /*
- * Continue to add memory to this fake node if its
- * non-reserved memory is less than the per-node size.
- */
- while (end - start - mem_hole_size(start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > limit) {
- end = limit;
- break;
- }
- }
-
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if (limit - end - mem_hole_size(end, limit) < size)
- end = limit;
-
- ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
- phys_blk,
- min(end, limit) - start);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
- u64 end = start + size;
-
- while (end - start - mem_hole_size(start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > max_addr) {
- end = max_addr;
- break;
- }
- }
- return end;
-}
-
-static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
-{
- unsigned long max_pfn = PHYS_PFN(max_addr);
- unsigned long base_pfn = PHYS_PFN(base);
- unsigned long hole_pfns = PHYS_PFN(hole);
-
- return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size,
- int nr_nodes, struct numa_memblk *pblk,
- int nid)
-{
- nodemask_t physnode_mask = numa_nodes_parsed;
- int i, ret, uniform = 0;
- u64 min_size;
-
- if ((!size && !nr_nodes) || (nr_nodes && !pblk))
- return -1;
-
- /*
- * In the 'uniform' case split the passed in physical node by
- * nr_nodes, in the non-uniform case, ignore the passed in
- * physical block and try to create nodes of at least size
- * @size.
- *
- * In the uniform case, split the nodes strictly by physical
- * capacity, i.e. ignore holes. In the non-uniform case account
- * for holes and treat @size as a minimum floor.
- */
- if (!nr_nodes)
- nr_nodes = MAX_NUMNODES;
- else {
- nodes_clear(physnode_mask);
- node_set(pblk->nid, physnode_mask);
- uniform = 1;
- }
-
- if (uniform) {
- min_size = uniform_size(max_addr, addr, 0, nr_nodes);
- size = min_size;
- } else {
- /*
- * The limit on emulated nodes is MAX_NUMNODES, so the
- * size per node is increased accordingly if the
- * requested size is too small. This creates a uniform
- * distribution of node sizes across the entire machine
- * (but not necessarily over physical nodes).
- */
- min_size = uniform_size(max_addr, addr,
- mem_hole_size(addr, max_addr), nr_nodes);
- }
- min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
- if (size < min_size) {
- pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
- size = min_size;
- }
- size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
-
- /*
- * Fill physical nodes with fake nodes of size until there is no memory
- * left on any of them.
- */
- while (!nodes_empty(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
- u64 start, limit, end;
- int phys_blk;
-
- phys_blk = emu_find_memblk_by_nid(i, pi);
- if (phys_blk < 0) {
- node_clear(i, physnode_mask);
- continue;
- }
-
- start = pi->blk[phys_blk].start;
- limit = pi->blk[phys_blk].end;
-
- if (uniform)
- end = start + size;
- else
- end = find_end_of_node(start, limit, size);
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if ((limit - end - mem_hole_size(end, limit) < size)
- && !uniform)
- end = limit;
-
- ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
- phys_blk,
- min(end, limit) - start);
- if (ret < 0)
- return ret;
- }
- }
- return nid;
-}
-
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size)
-{
- return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
- 0, NULL, 0);
-}
-
-static int __init setup_emu2phys_nid(int *dfl_phys_nid)
-{
- int i, max_emu_nid = 0;
-
- *dfl_phys_nid = NUMA_NO_NODE;
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
- if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
- max_emu_nid = i;
- if (*dfl_phys_nid == NUMA_NO_NODE)
- *dfl_phys_nid = emu_nid_to_phys[i];
- }
- }
-
- return max_emu_nid;
-}
-
-/**
- * numa_emulation - Emulate NUMA nodes
- * @numa_meminfo: NUMA configuration to massage
- * @numa_dist_cnt: The size of the physical NUMA distance table
- *
- * Emulate NUMA nodes according to the numa=fake kernel parameter.
- * @numa_meminfo contains the physical memory configuration and is modified
- * to reflect the emulated configuration on success. @numa_dist_cnt is
- * used to determine the size of the physical distance table.
- *
- * On success, the following modifications are made.
- *
- * - @numa_meminfo is updated to reflect the emulated nodes.
- *
- * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
- * emulated nodes.
- *
- * - NUMA distance table is rebuilt to represent distances between emulated
- * nodes. The distances are determined considering how emulated nodes
- * are mapped to physical nodes and match the actual distances.
- *
- * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
- * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
- *
- * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
- * identity mapping and no other modification is made.
- */
-void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
-{
- static struct numa_meminfo ei __initdata;
- static struct numa_meminfo pi __initdata;
- const u64 max_addr = PFN_PHYS(max_pfn);
- u8 *phys_dist = NULL;
- size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
- int max_emu_nid, dfl_phys_nid;
- int i, j, ret;
-
- if (!emu_cmdline)
- goto no_emu;
-
- memset(&ei, 0, sizeof(ei));
- pi = *numa_meminfo;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- emu_nid_to_phys[i] = NUMA_NO_NODE;
-
- /*
- * If the numa=fake command-line contains a 'M' or 'G', it represents
- * the fixed node size. Otherwise, if it is just a single number N,
- * split the system RAM into N fake nodes.
- */
- if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
- unsigned long n;
- int nid = 0;
-
- n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
- ret = -1;
- for_each_node_mask(i, physnode_mask) {
- /*
- * The reason we pass in blk[0] is due to
- * numa_remove_memblk_from() called by
- * emu_setup_memblk() will delete entry 0
- * and then move everything else up in the pi.blk
- * array. Therefore we should always be looking
- * at blk[0].
- */
- ret = split_nodes_size_interleave_uniform(&ei, &pi,
- pi.blk[0].start, pi.blk[0].end, 0,
- n, &pi.blk[0], nid);
- if (ret < 0)
- break;
- if (ret < n) {
- pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
- __func__, i, ret, n);
- ret = -1;
- break;
- }
- nid = ret;
- }
- } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
- u64 size;
-
- size = memparse(emu_cmdline, &emu_cmdline);
- ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
- } else {
- unsigned long n;
-
- n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
- ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
- }
- if (*emu_cmdline == ':')
- emu_cmdline++;
-
- if (ret < 0)
- goto no_emu;
-
- if (numa_cleanup_meminfo(&ei) < 0) {
- pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
- goto no_emu;
- }
-
- /* copy the physical distance table */
- if (numa_dist_cnt) {
- u64 phys;
-
- phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
- PFN_PHYS(max_pfn_mapped));
- if (!phys) {
- pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
- goto no_emu;
- }
- phys_dist = __va(phys);
-
- for (i = 0; i < numa_dist_cnt; i++)
- for (j = 0; j < numa_dist_cnt; j++)
- phys_dist[i * numa_dist_cnt + j] =
- node_distance(i, j);
- }
-
- /*
- * Determine the max emulated nid and the default phys nid to use
- * for unmapped nodes.
- */
- max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
-
- /* commit */
- *numa_meminfo = ei;
-
- /* Make sure numa_nodes_parsed only contains emulated nodes */
- nodes_clear(numa_nodes_parsed);
- for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
- if (ei.blk[i].start != ei.blk[i].end &&
- ei.blk[i].nid != NUMA_NO_NODE)
- node_set(ei.blk[i].nid, numa_nodes_parsed);
-
- /*
- * Transform __apicid_to_node table to use emulated nids by
- * reverse-mapping phys_nid. The maps should always exist but fall
- * back to zero just in case.
- */
- for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
- if (__apicid_to_node[i] == NUMA_NO_NODE)
- continue;
- for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
- if (__apicid_to_node[i] == emu_nid_to_phys[j])
- break;
- __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
- }
-
- /* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
- if (emu_nid_to_phys[i] == NUMA_NO_NODE)
- emu_nid_to_phys[i] = dfl_phys_nid;
-
- /* transform distance table */
- numa_reset_distance();
- for (i = 0; i < max_emu_nid + 1; i++) {
- for (j = 0; j < max_emu_nid + 1; j++) {
- int physi = emu_nid_to_phys[i];
- int physj = emu_nid_to_phys[j];
- int dist;
-
- if (get_option(&emu_cmdline, &dist) == 2)
- ;
- else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
- dist = physi == physj ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- else
- dist = phys_dist[physi * numa_dist_cnt + physj];
-
- numa_set_distance(i, j, dist);
- }
- }
-
- /* free the copied physical distance table */
- memblock_free(phys_dist, phys_size);
- return;
-
-no_emu:
- /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
- emu_nid_to_phys[i] = i;
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void numa_add_cpu(int cpu)
-{
- int physnid, nid;
-
- nid = early_cpu_to_node(cpu);
- BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
-
- physnid = emu_nid_to_phys[nid];
-
- /*
- * Map the cpu to each emulated node that is allocated on the physical
- * node of the cpu's apic id.
- */
- for_each_online_node(nid)
- if (emu_nid_to_phys[nid] == physnid)
- cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void numa_remove_cpu(int cpu)
-{
- int i;
-
- for_each_online_node(i)
- cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void numa_set_cpumask(int cpu, bool enable)
-{
- int nid, physnid;
-
- nid = early_cpu_to_node(cpu);
- if (nid == NUMA_NO_NODE) {
- /* early_cpu_to_node() already emits a warning and trace */
- return;
- }
-
- physnid = emu_nid_to_phys[nid];
-
- for_each_online_node(nid) {
- if (emu_nid_to_phys[nid] != physnid)
- continue;
-
- debug_cpumask_set_cpu(cpu, nid, enable);
- }
-}
-
-void numa_add_cpu(int cpu)
-{
- numa_set_cpumask(cpu, true);
-}
-
-void numa_remove_cpu(int cpu)
-{
- numa_set_cpumask(cpu, false);
-}
-#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 86860f279662..11e1ff370c10 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -5,30 +5,6 @@
#include <linux/types.h>
#include <asm/numa.h>
-struct numa_memblk {
- u64 start;
- u64 end;
- int nid;
-};
-
-struct numa_meminfo {
- int nr_blks;
- struct numa_memblk blk[NR_NODE_MEMBLKS];
-};
-
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
-void __init numa_reset_distance(void);
-
void __init x86_numa_init(void);
-#ifdef CONFIG_NUMA_EMU
-void __init numa_emulation(struct numa_meminfo *numa_meminfo,
- int numa_dist_cnt);
-#else
-static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
- int numa_dist_cnt)
-{ }
-#endif
-
#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 3d2f7f0a6ed1..ad3c1feec990 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -183,7 +183,7 @@ static int pageattr_test(void)
break;
case 1:
- err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
+ err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1);
break;
case 2:
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 36b603d0cdde..72d8cbc61158 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,9 +39,11 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
+#include <asm/cpu_device_id.h>
#include <asm/cacheflush.h>
#include <asm/cacheinfo.h>
#include <asm/processor.h>
@@ -103,7 +105,7 @@ __setup("debugpat", pat_debug_setup);
#ifdef CONFIG_X86_PAT
/*
- * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * X86 PAT uses page flags arch_1 and arch_2 together to keep track of
* memory type of pages that have backing page struct.
*
* X86 PAT supports 4 different memory types:
@@ -117,9 +119,9 @@ __setup("debugpat", pat_debug_setup);
#define _PGMT_WB 0
#define _PGMT_WC (1UL << PG_arch_1)
-#define _PGMT_UC_MINUS (1UL << PG_uncached)
-#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
-#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_arch_2)
+#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1)
+#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
static inline enum page_cache_mode get_page_memtype(struct page *pg)
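The switch from PG_uncached to PG_arch_2 keeps the same two-bit encoding of the four trackable PAT memtypes in the page flags. A stand-alone sketch of that encoding, assuming made-up bit positions (not the real PG_arch_1/PG_arch_2 values):

#include <stdio.h>

#define PG_arch_1	1UL		/* illustrative bit numbers only */
#define PG_arch_2	2UL

#define _PGMT_WB	0
#define _PGMT_WC	(1UL << PG_arch_1)
#define _PGMT_UC_MINUS	(1UL << PG_arch_2)
#define _PGMT_WT	((1UL << PG_arch_2) | (1UL << PG_arch_1))
#define _PGMT_MASK	((1UL << PG_arch_2) | (1UL << PG_arch_1))

static const char *memtype_name(unsigned long flags)
{
	switch (flags & _PGMT_MASK) {
	case _PGMT_WC:		return "WC";
	case _PGMT_UC_MINUS:	return "UC-";
	case _PGMT_WT:		return "WT";
	default:		return "WB";	/* _PGMT_WB is the all-clear state */
	}
}

int main(void)
{
	unsigned long flags = 0;

	flags = (flags & ~_PGMT_MASK) | _PGMT_WT;	/* set both bits: WT */
	printf("%s\n", memtype_name(flags));		/* prints WT */
	return 0;
}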
@@ -175,15 +177,6 @@ static inline void set_page_memtype(struct page *pg,
}
#endif
-enum {
- PAT_UC = 0, /* uncached */
- PAT_WC = 1, /* Write combining */
- PAT_WT = 4, /* Write Through */
- PAT_WP = 5, /* Write Protected */
- PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
-};
-
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
@@ -193,13 +186,13 @@ static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
char *cache_mode;
switch (pat_val) {
- case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
- case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
- case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
- case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
- case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
- case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
- default: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
}
memcpy(msg, cache_mode, 4);
@@ -256,12 +249,6 @@ void pat_cpu_init(void)
void __init pat_bp_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
-#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \
- (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \
- ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \
- ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \
- ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
-
if (!IS_ENABLED(CONFIG_X86_PAT))
pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
@@ -292,7 +279,7 @@ void __init pat_bp_init(void)
* NOTE: When WC or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
}
/*
@@ -304,9 +291,8 @@ void __init pat_bp_init(void)
return;
}
- if ((c->x86_vendor == X86_VENDOR_INTEL) &&
- (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
- ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ if ((c->x86_vfm >= INTEL_PENTIUM_PRO && c->x86_vfm <= INTEL_PENTIUM_M_DOTHAN) ||
+ (c->x86_vfm >= INTEL_P4_WILLAMETTE && c->x86_vfm <= INTEL_P4_CEDARMILL)) {
/*
* PAT support with the lower four entries. Intel Pentium 2,
* 3, M, and 4 are affected by PAT errata, which makes the
@@ -327,7 +313,7 @@ void __init pat_bp_init(void)
* NOTE: When WT or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
} else {
/*
* Full PAT support. We put WT in slot 7 to improve
@@ -355,13 +341,12 @@ void __init pat_bp_init(void)
* The reserved slots are unused, but mapped to their
* corresponding types in the presence of PAT errata.
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
}
memory_caching_control |= CACHE_PAT;
init_cache_modes(pat_msr_val);
-#undef PAT
}
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
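With the local PAT() helper gone, the MSR image is built by PAT_VALUE(), which presumably packs eight 8-bit entries the same way the removed macro did. A stand-alone sketch of that packing, reusing the memtype numbers from the enum removed earlier in this file:

#include <stdint.h>
#include <stdio.h>

/* values match the removed PAT_UC..PAT_UC_MINUS enum */
enum { MT_UC = 0, MT_WC = 1, MT_WT = 4, MT_WP = 5, MT_WB = 6, MT_UC_MINUS = 7 };

static uint64_t pat_pack(const uint8_t e[8])
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v |= (uint64_t)e[i] << (8 * i);	/* PAT entry i lives in byte i */
	return v;
}

int main(void)
{
	/* the "full PAT support" layout: WB, WC, UC-, UC, WB, WP, UC-, WT */
	const uint8_t full_pat[8] = { MT_WB, MT_WC, MT_UC_MINUS, MT_UC,
				      MT_WB, MT_WP, MT_UC_MINUS, MT_WT };

	printf("PAT MSR = %#018llx\n",
	       (unsigned long long)pat_pack(full_pat));	/* 0x0407050600070106 */
	return 0;
}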
@@ -947,6 +932,26 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+ resource_size_t *phys)
+{
+ struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
+
+ if (follow_pfnmap_start(&args))
+ return -EINVAL;
+
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (!args.special) {
+ follow_pfnmap_end(&args);
+ return -EINVAL;
+ }
+
+ *prot = pgprot_val(args.pgprot);
+ *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
+ follow_pfnmap_end(&args);
+ return 0;
+}
+
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
pgprot_t *pgprot)
{
@@ -964,7 +969,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
* detect the PFN. If we need the cachemode as well, we're out of luck
* for now and have to fail fork().
*/
- if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (!follow_phys(vma, &prot, paddr)) {
if (pgprot)
*pgprot = __pgprot(prot);
return 0;
@@ -979,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
return -EINVAL;
}
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long *pfn)
{
+ const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
+ int rc;
- if (vma->vm_flags & VM_PAT) {
- if (get_pat_info(vma, &paddr, &pgprot))
- return -EINVAL;
- /* reserve the whole chunk covered by vma. */
- return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
- }
+ if (!(src_vma->vm_flags & VM_PAT))
+ return 0;
+
+ /*
+ * Duplicate the PAT information for the dst VMA based on the src
+ * VMA.
+ */
+ if (get_pat_info(src_vma, &paddr, &pgprot))
+ return -EINVAL;
+ rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ if (rc)
+ return rc;
+ /* Reservation for the destination VMA succeeded. */
+ vm_flags_set(dst_vma, VM_PAT);
+ *pfn = PHYS_PFN(paddr);
return 0;
}
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+ untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+ /*
+ * Reservation was freed, any copied page tables will get cleaned
+ * up later, but without getting PAT involved again.
+ */
+}
+
/*
* prot is passed in as a parameter for the new mapping. If the vma has
* a linear pfn mapping for the entire range, or no vma is provided,
@@ -1090,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
}
}
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- * its pfnmap page table has been removed. The new vma has a new pfnmap
- * to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- * old vma.
- */
void untrack_pfn_clear(struct vm_area_struct *vma)
{
vm_flags_clear(vma, VM_PAT);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 80c9037ffadf..def3d9284254 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -32,8 +32,6 @@
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
-#include <asm/hyperv-tlfs.h>
-#include <asm/mshyperv.h>
#include "../mm_internal.h"
@@ -75,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+#define CPA_COLLAPSE 16 /* try to collapse large pages */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
@@ -107,6 +106,18 @@ static void split_page_count(int level)
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
+static void collapse_page_count(int level)
+{
+ direct_pages_count[level]++;
+ if (system_state == SYSTEM_RUNNING) {
+ if (level == PG_LEVEL_2M)
+ count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
+ else if (level == PG_LEVEL_1G)
+ count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
+ }
+ direct_pages_count[level - 1] -= PTRS_PER_PTE;
+}
+
void arch_report_meminfo(struct seq_file *m)
{
seq_printf(m, "DirectMap4k: %8lu kB\n",
@@ -124,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m)
}
#else
static inline void split_page_count(int level) { }
+static inline void collapse_page_count(int level) { }
#endif
#ifdef CONFIG_X86_CPA_STATISTICS
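collapse_page_count() added above is the exact inverse of split_page_count(): one large mapping reappears in the direct-map statistics and PTRS_PER_PTE smaller ones vanish. A toy model of that bookkeeping (the counters below are illustrative, not the kernel's direct_pages_count[] array):

#include <stdio.h>

#define PTRS_PER_PTE	512

static unsigned long pages_4k, pages_2m = 10;

static void split_2m(void)
{
	pages_2m--;
	pages_4k += PTRS_PER_PTE;	/* one 2M page becomes 512 4k pages */
}

static void collapse_2m(void)
{
	pages_2m++;
	pages_4k -= PTRS_PER_PTE;	/* and the collapse undoes it */
}

int main(void)
{
	split_2m();
	collapse_2m();
	printf("DirectMap4k=%lu DirectMap2M=%lu\n", pages_4k, pages_2m);
	return 0;
}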
@@ -213,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
+#ifdef CONFIG_X86_64
+
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
return addr >= start && addr <= end;
}
-#ifdef CONFIG_X86_64
-
/*
* The kernel image is mapped into two places in the virtual address space
* (addresses without KASLR, of course):
@@ -354,7 +366,7 @@ bool cpu_cache_has_invalidate_memregion(void)
{
return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
}
-EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);
+EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
int cpu_cache_invalidate_memregion(int res_desc)
{
@@ -363,7 +375,7 @@ int cpu_cache_invalidate_memregion(int res_desc)
wbinvd_on_all_cpus();
return 0;
}
-EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
+EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
#endif
static void __cpa_flush_all(void *arg)
@@ -396,16 +408,49 @@ static void __cpa_flush_tlb(void *data)
flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
-static void cpa_flush(struct cpa_data *data, int cache)
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_collapse_large_pages(struct cpa_data *cpa)
+{
+ unsigned long start, addr, end;
+ struct ptdesc *ptdesc, *tmp;
+ LIST_HEAD(pgtables);
+ int collapsed = 0;
+ int i;
+
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ for (i = 0; i < cpa->numpages; i++)
+ collapsed += collapse_large_pages(__cpa_addr(cpa, i),
+ &pgtables);
+ } else {
+ addr = __cpa_addr(cpa, 0);
+ start = addr & PMD_MASK;
+ end = addr + PAGE_SIZE * cpa->numpages;
+
+ for (addr = start; within(addr, start, end); addr += PMD_SIZE)
+ collapsed += collapse_large_pages(addr, &pgtables);
+ }
+
+ if (!collapsed)
+ return;
+
+ flush_tlb_all();
+
+ list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+ list_del(&ptdesc->pt_list);
+ __free_page(ptdesc_page(ptdesc));
+ }
+}
+
+static void cpa_flush(struct cpa_data *cpa, int cache)
{
- struct cpa_data *cpa = data;
unsigned int i;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
- return;
+ goto collapse_large_pages;
}
if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
@@ -414,7 +459,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
on_each_cpu(__cpa_flush_tlb, cpa, 1);
if (!cache)
- return;
+ goto collapse_large_pages;
mb();
for (i = 0; i < cpa->numpages; i++) {
@@ -430,6 +475,10 @@ static void cpa_flush(struct cpa_data *data, int cache)
clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
+
+collapse_large_pages:
+ if (cpa->flags & CPA_COLLAPSE)
+ cpa_collapse_large_pages(cpa);
}
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -619,7 +668,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +691,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
@@ -657,56 +711,82 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry (or NULL if the entry does not exist),
+ * the level of the entry, and the effective NX and RW bits of all
+ * page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- *level = PG_LEVEL_NONE;
+ *level = PG_LEVEL_256T;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *level = PG_LEVEL_512G;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
- *level = PG_LEVEL_512G;
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *level = PG_LEVEL_1G;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
- *level = PG_LEVEL_1G;
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *level = PG_LEVEL_2M;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
- *level = PG_LEVEL_2M;
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
*level = PG_LEVEL_4K;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
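lookup_address_in_pgd_attr() accumulates the effective permissions while it walks: RW is ANDed and NX is ORed across the levels, so any non-leaf entry can take write or execute permission away. A user-space sketch of that accumulation (the flag bit positions and the sample walk are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

#define _PAGE_RW	(1u << 1)	/* illustrative bit positions */
#define _PAGE_NX	(1u << 7)

struct level {
	const char *name;
	unsigned int flags;
};

int main(void)
{
	const struct level walk[] = {
		{ "pgd", _PAGE_RW },
		{ "p4d", _PAGE_RW },
		{ "pud", _PAGE_RW | _PAGE_NX },	/* NX set at the PUD level */
		{ "pmd", _PAGE_RW },
	};
	bool rw = true, nx = false;

	for (unsigned int i = 0; i < sizeof(walk) / sizeof(walk[0]); i++) {
		rw &= !!(walk[i].flags & _PAGE_RW);	/* every level must allow writes */
		nx |= !!(walk[i].flags & _PAGE_NX);	/* any level can forbid execution */
	}
	printf("effective rw=%d nx=%d\n", rw, nx);	/* rw=1 nx=1 */
	return 0;
}

This is what lets verify_rwx() skip the W^X warning when an upper-level entry already denies write or execute.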
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Note: the function returns p4d, pud or pmd either when the entry is marked
+ * large or when the present bit is not set. Otherwise it returns NULL.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
@@ -715,13 +795,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -849,12 +932,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -965,7 +1049,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1046,6 +1131,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1053,7 +1139,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1082,8 +1168,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
- * otherwise pmd_present/pmd_huge will return true
- * even on a non present pmd.
+ * otherwise pmd_present() will return true even on a non
+ * present pmd.
*/
if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE;
@@ -1162,6 +1248,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
+static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
+ struct list_head *pgtables)
+{
+ pmd_t _pmd, old_pmd;
+ pte_t *pte, first;
+ unsigned long pfn;
+ pgprot_t pgprot;
+ int i = 0;
+
+ addr &= PMD_MASK;
+ pte = pte_offset_kernel(pmd, addr);
+ first = *pte;
+ pfn = pte_pfn(first);
+
+ /* Make sure alignment is suitable */
+ if (PFN_PHYS(pfn) & ~PMD_MASK)
+ return 0;
+
+ /* The page is 4k intentionally */
+ if (pte_flags(first) & _PAGE_KERNEL_4K)
+ return 0;
+
+ /* Check that the rest of PTEs are compatible with the first one */
+ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+ pte_t entry = *pte;
+
+ if (!pte_present(entry))
+ return 0;
+ if (pte_flags(entry) != pte_flags(first))
+ return 0;
+ if (pte_pfn(entry) != pte_pfn(first) + i)
+ return 0;
+ }
+
+ old_pmd = *pmd;
+
+ /* Success: set up a large page */
+ pgprot = pgprot_4k_2_large(pte_pgprot(first));
+ pgprot_val(pgprot) |= _PAGE_PSE;
+ _pmd = pfn_pmd(pfn, pgprot);
+ set_pmd(pmd, _pmd);
+
+ /* Queue the page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
+
+ if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ /* Update all PGD tables to use the same large page */
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+			/* Something is wrong if the entries don't match */
+ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+ continue;
+ set_pmd(pmd, _pmd);
+ }
+ }
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_2M);
+
+ return 1;
+}
+
+static int collapse_pud_page(pud_t *pud, unsigned long addr,
+ struct list_head *pgtables)
+{
+ unsigned long pfn;
+ pmd_t *pmd, first;
+ int i;
+
+ if (!direct_gbpages)
+ return 0;
+
+ addr &= PUD_MASK;
+ pmd = pmd_offset(pud, addr);
+ first = *pmd;
+
+ /*
+	 * To restore the PUD page, all PMD entries must be large and
+	 * have suitable alignment.
+ */
+ pfn = pmd_pfn(first);
+ if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
+ return 0;
+
+ /*
+	 * To restore the PUD page, all following PMDs must be compatible with the
+ * first one.
+ */
+ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
+ pmd_t entry = *pmd;
+
+ if (!pmd_present(entry) || !pmd_leaf(entry))
+ return 0;
+ if (pmd_flags(entry) != pmd_flags(first))
+ return 0;
+ if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
+ return 0;
+ }
+
+ /* Restore PUD page and queue page table to be freed after TLB flush */
+ list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
+ set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
+
+ if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+ collapse_page_count(PG_LEVEL_1G);
+
+ return 1;
+}
+
+/*
+ * Collapse PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+ int collapsed = 0;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ addr &= PMD_MASK;
+
+ spin_lock(&pgd_lock);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ goto out;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ goto out;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud) || pud_leaf(*pud))
+ goto out;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd) || pmd_leaf(*pmd))
+ goto out;
+
+ collapsed = collapse_pmd_page(pmd, addr, pgtables);
+ if (collapsed)
+ collapsed += collapse_pud_page(pud, addr, pgtables);
+
+out:
+ spin_unlock(&pgd_lock);
+ return collapsed;
+}
+
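The collapse helpers only merge a page table when every entry in it is provably equivalent to one large mapping: all present, identical flags, physically contiguous, and the first PFN aligned to the large-page boundary. A minimal sketch of that test (the struct layout and flag value are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRIES	512			/* PTRS_PER_PTE on x86-64 */

struct toy_pte {
	bool present;
	uint32_t flags;
	uint64_t pfn;
};

static bool can_collapse(const struct toy_pte *pte)
{
	if (pte[0].pfn % ENTRIES)	/* first PFN must sit on the 2M boundary */
		return false;

	for (int i = 0; i < ENTRIES; i++) {
		if (!pte[i].present ||
		    pte[i].flags != pte[0].flags ||
		    pte[i].pfn != pte[0].pfn + i)	/* physically contiguous */
			return false;
	}
	return true;
}

int main(void)
{
	static struct toy_pte ptes[ENTRIES];

	for (int i = 0; i < ENTRIES; i++)
		ptes[i] = (struct toy_pte){ .present = true, .flags = 0x63,
					    .pfn = ENTRIES + i };

	printf("collapse? %d\n", can_collapse(ptes));	/* 1 */
	return 0;
}

The same idea repeats one level up in collapse_pud_page(), with leaf PMDs in place of PTEs.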
static bool try_to_free_pte_page(pte_t *pte)
{
int i;
@@ -1594,10 +1835,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1619,7 +1861,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
@@ -1905,19 +2148,6 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
-/*
- * __set_memory_prot is an internal helper for callers that have been passed
- * a pgprot_t value from upper layers and a reservation has already been taken.
- * If you want to set the pgprot to a specific page protocol, use the
- * set_memory_xx() functions.
- */
-int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
-{
- return change_page_attr_set_clr(&addr, numpages, prot,
- __pgprot(~pgprot_val(prot)), 0, 0,
- NULL);
-}
-
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
@@ -2044,6 +2274,7 @@ int set_mce_nospec(unsigned long pfn)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
}
+EXPORT_SYMBOL_GPL(set_mce_nospec);
/* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
@@ -2083,7 +2314,8 @@ int set_memory_rox(unsigned long addr, int numpages)
if (__supported_pte_mask & _PAGE_NX)
clr.pgprot |= _PAGE_NX;
- return change_page_attr_clear(&addr, numpages, clr, 0);
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
+ CPA_COLLAPSE, NULL);
}
int set_memory_rw(unsigned long addr, int numpages)
@@ -2110,7 +2342,8 @@ int set_memory_p(unsigned long addr, int numpages)
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ return change_page_attr_set_clr(&addr, numpages,
+ __pgprot(_PAGE_KERNEL_4K),
__pgprot(0), 1, 0, NULL);
}
@@ -2156,7 +2389,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
/* Notify hypervisor that we are about to set/clr encryption attribute. */
- if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
+ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
+ if (ret)
goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2174,24 +2408,61 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
return ret;
/* Notify hypervisor that we have successfully set/clr encryption attribute. */
- if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
+ if (ret)
goto vmm_fail;
return 0;
vmm_fail:
- WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
- (void *)addr, numpages, enc ? "private" : "shared");
+ WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
+ (void *)addr, numpages, enc ? "private" : "shared", ret);
+
+ return ret;
+}
+
+/*
+ * The lock serializes conversions between private and shared memory.
+ *
+ * It is taken for read on conversion. A write lock guarantees that no
+ * concurrent conversions are in progress.
+ */
+static DECLARE_RWSEM(mem_enc_lock);
- return -EIO;
+/*
+ * Stop new private<->shared conversions.
+ *
+ * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
+ * The lock is not released to prevent new conversions from being started.
+ */
+bool set_memory_enc_stop_conversion(void)
+{
+ /*
+ * In a crash scenario, sleep is not allowed. Try to take the lock.
+ * Failure indicates that there is a race with the conversion.
+ */
+ if (oops_in_progress)
+ return down_write_trylock(&mem_enc_lock);
+
+ down_write(&mem_enc_lock);
+
+ return true;
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
- return __set_memory_enc_pgtable(addr, numpages, enc);
+ int ret = 0;
- return 0;
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (!down_read_trylock(&mem_enc_lock))
+ return -EBUSY;
+
+ ret = __set_memory_enc_pgtable(addr, numpages, enc);
+
+ up_read(&mem_enc_lock);
+ }
+
+ return ret;
}
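
The new mem_enc_lock makes every private<->shared conversion a reader, while set_memory_enc_stop_conversion() takes the lock for write and deliberately never releases it, so no further conversions can start. A hedged sketch of how a shutdown or crash path might use it; the caller name is hypothetical, only set_memory_enc_stop_conversion() comes from the patch:

/*
 * Hypothetical caller sketch (not from the patch): stop guest
 * private<->shared conversions before jumping into a crash kernel.
 */
static void example_prepare_for_crash_kexec(void)
{
	/*
	 * With oops_in_progress set, this degrades to a trylock;
	 * a false return means a conversion is racing with us.
	 */
	if (!set_memory_enc_stop_conversion())
		pr_warn("memory conversion still in flight, continuing anyway\n");

	/* The write lock is deliberately never released. */
}
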
int set_memory_encrypted(unsigned long addr, int numpages)
@@ -2345,7 +2616,7 @@ static int __set_pages_np(struct page *page, int numpages)
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS };
/*
@@ -2367,6 +2638,14 @@ int set_direct_map_default_noflush(struct page *page)
return __set_pages_p(page, 1);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ if (valid)
+ return __set_pages_p(page, nr);
+
+ return __set_pages_np(page, nr);
+}
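
set_direct_map_valid_noflush() only rewrites the direct-map entries; as the name says, it leaves TLB flushing to the caller, which is expected to flush once per batch. A minimal sketch under that assumption; the caller below is hypothetical and assumes a physically contiguous run of pages:

/*
 * Illustrative caller (not part of the patch): take a physically
 * contiguous run of pages out of the direct map, then do one TLB
 * flush for the whole range, since the helper itself does not flush.
 */
static int example_unmap_range(struct page *page, unsigned int nr)
{
	unsigned long start = (unsigned long)page_address(page);
	int ret;

	ret = set_direct_map_valid_noflush(page, nr, false);
	if (ret)
		return ret;

	flush_tlb_kernel_range(start, start + (unsigned long)nr * PAGE_SIZE);
	return 0;
}
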
+
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
@@ -2424,7 +2703,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
.flags = CPA_NO_CHECK_ALIAS,
};
@@ -2467,7 +2746,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS,
};
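
Several hunks above now clear _PAGE_DIRTY together with _PAGE_RW, and the page_flags-derived masks do the same. A plausible reading, not stated in the diff itself: on shadow-stack capable CPUs the Write=0,Dirty=1 combination encodes a shadow-stack page, so dropping only the Write bit could leave behind a mapping that looks like shadow stack. A small sketch of that bit combination; the kernel's real helpers are pte_shstk() and friends, the open-coded check is for illustration only:

/*
 * Sketch only: how a Write=0,Dirty=1 combination can be spotted.
 * Clearing _PAGE_DIRTY alongside _PAGE_RW avoids ever creating it.
 */
static bool example_looks_like_shadow_stack(pteval_t val)
{
	return (val & _PAGE_DIRTY) && !(val & _PAGE_RW);
}
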
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d007591b8059..f7ae44d3dd9e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -12,55 +12,20 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif
-#ifdef CONFIG_HIGHPTE
-#define PGTABLE_HIGHMEM __GFP_HIGHMEM
-#else
-#define PGTABLE_HIGHMEM 0
-#endif
-
-#ifndef CONFIG_PARAVIRT
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-#endif
-
-gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
-
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- return __pte_alloc_one(mm, __userpte_alloc_gfp);
-}
-
-static int __init setup_userpte(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- /*
- * "userpte=nohigh" disables allocation of user pagetables in
- * high memory.
- */
- if (strcmp(arg, "nohigh") == 0)
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
- else
- return -EINVAL;
- return 0;
+ return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}
-early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
- pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, pte);
+ tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
- struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,25 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- pagetable_pmd_dtor(ptdesc);
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
- struct ptdesc *ptdesc = virt_to_ptdesc(pud);
-
- pagetable_pud_dtor(ptdesc);
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -110,7 +71,7 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD \
- max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
+ MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
@@ -222,7 +183,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (pmds[i]) {
ptdesc = virt_to_ptdesc(pmds[i]);
- pagetable_pmd_dtor(ptdesc);
+ pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
mm_dec_nr_pmds(mm);
}
@@ -392,15 +353,14 @@ void __init pgtable_cache_init(void)
SLAB_PANIC, NULL);
}
-static inline pgd_t *_pgd_alloc(void)
+static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
/*
* If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
- return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
- PGD_ALLOCATION_ORDER);
+ return __pgd_alloc(mm, pgd_allocation_order());
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
@@ -409,24 +369,23 @@ static inline pgd_t *_pgd_alloc(void)
return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}
-static inline void _pgd_free(pgd_t *pgd)
+static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
- free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ __pgd_free(mm, pgd);
else
kmem_cache_free(pgd_cache, pgd);
}
#else
-static inline pgd_t *_pgd_alloc(void)
+static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
- return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
- PGD_ALLOCATION_ORDER);
+ return __pgd_alloc(mm, pgd_allocation_order());
}
-static inline void _pgd_free(pgd_t *pgd)
+static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ __pgd_free(mm, pgd);
}
#endif /* CONFIG_X86_PAE */
@@ -436,7 +395,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
pmd_t *pmds[MAX_PREALLOCATED_PMDS];
- pgd = _pgd_alloc();
+ pgd = _pgd_alloc(mm);
if (pgd == NULL)
goto out;
@@ -479,7 +438,7 @@ out_free_pmds:
if (sizeof(pmds) != 0)
free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
- _pgd_free(pgd);
+ _pgd_free(mm, pgd);
out:
return NULL;
}
@@ -489,7 +448,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
- _pgd_free(pgd);
+ _pgd_free(mm, pgd);
}
/*
@@ -631,6 +590,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
@@ -639,6 +600,18 @@ pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
}
#endif
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return old;
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -731,7 +704,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
return 0;
/* Bail out if we are on a populated non-leaf entry: */
- if (pud_present(*pud) && !pud_huge(*pud))
+ if (pud_present(*pud) && !pud_leaf(*pud))
return 0;
set_pte((pte_t *)pud, pfn_pte(
@@ -760,7 +733,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/* Bail out if we are on a populated non-leaf entry: */
- if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ if (pmd_present(*pmd) && !pmd_leaf(*pmd))
return 0;
set_pte((pte_t *)pmd, pfn_pte(
@@ -842,7 +815,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
free_page((unsigned long)pmd_sv);
- pagetable_pmd_dtor(virt_to_ptdesc(pmd));
+ pagetable_dtor(virt_to_ptdesc(pmd));
free_page((unsigned long)pmd);
return 1;
@@ -924,3 +897,9 @@ void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
pmd_shstk(pmd));
}
+
+void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
+{
+ /* See note in arch_check_zapped_pte() */
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
+}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2e69abf4f852..5f0d579932c6 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -132,7 +132,7 @@ pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
* Top-level entries added to init_mm's usermode pgd after boot
* will not be automatically propagated to other mms.
*/
- if (!pgdp_maps_userspace(pgdp))
+ if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW))
return pgd;
/*
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_leaf(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -374,14 +379,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
*/
*target_pmd = *pmd;
- addr += PMD_SIZE;
+ addr = round_up(addr + 1, PMD_SIZE);
} else if (level == PTI_CLONE_PTE) {
/* Walk the page-table down to the pte level */
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
continue;
}
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -401,7 +406,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
/* Clone the PTE */
*target_pte = *pte;
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
} else {
BUG();
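
Switching from addr += PMD_SIZE / addr += PAGE_SIZE to addr = round_up(addr + 1, ...) changes how the cursor advances when the start address is not aligned: instead of stepping by a fixed stride from a misaligned address, it snaps to the next PMD or page boundary. A small stand-alone sketch of the two stepping rules, assuming an illustrative 2 MiB PMD size:

/* Illustrative only; mirrors the round_up() stepping used above. */
#define EX_PMD_SIZE (2UL << 20)	/* 2 MiB */

static unsigned long old_step(unsigned long addr)
{
	return addr + EX_PMD_SIZE;	/* 0x201000 -> 0x401000, still misaligned */
}

static unsigned long new_step(unsigned long addr)
{
	/* round_up(addr + 1, EX_PMD_SIZE) written out long-hand */
	return (addr + EX_PMD_SIZE) & ~(EX_PMD_SIZE - 1);	/* 0x201000 -> 0x400000 */
}
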
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_CLONE_PMD);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -638,8 +643,15 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
+
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ /*
+	 * This is very early in boot. Device and late initcalls can do
+	 * modprobe before free_initmem() and mark_readonly(). This
+	 * pti_clone_entry_text() allows those user-mode helpers to function,
+ * but notably the text is still RW.
+ */
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -656,10 +668,11 @@ void pti_finalize(void)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
/*
- * We need to clone everything (again) that maps parts of the
- * kernel image.
+ * This is after free_initmem() (all initcalls are done) and we've done
+ * mark_readonly(). Text is now NX which might've split some PMDs
+ * relative to the early clone.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 9c52a95937ad..6f8e0f21c710 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
int __init x86_acpi_numa_init(void)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index bda73cb7a044..ae295659ca14 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -144,3 +144,4 @@ static void __exit cleanup(void)
module_init(init);
module_exit(cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test module for mmiotrace");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 44ac64f3a047..eb83348f9305 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -19,6 +20,7 @@
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
+#include <asm/tlb.h>
#include "mm_internal.h"
@@ -72,22 +74,21 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS 12
-
/*
* When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
@@ -160,7 +161,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
unsigned long cr3 = __sme_pa(pgd) | lam;
if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
+ /*
+	 * TLB consistency for global ASIDs is maintained with hardware-assisted
+	 * remote TLB flushing. Global ASIDs are always up to date.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -254,6 +268,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
}
/*
+ * Global ASIDs are allocated for multi-threaded processes that are
+ * active on multiple CPUs simultaneously, giving each of those
+ * processes the same PCID on every CPU, for use with hardware-assisted
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
+ *
+ * These global ASIDs are held for the lifetime of the process.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+/*
+ * When the search for a free ASID in the global ASID space reaches
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
+ * freed global ASIDs are safe to re-use.
+ *
+ * This way the global flush only needs to happen at ASID rollover
+ * time, and not at ASID allocation time.
+ */
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * The TLB flush above makes it safe to re-use the previously
+ * freed global ASIDs.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /* Restart the search from the start of global ASID space. */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 allocate_global_asid(void)
+{
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+	/* The previous allocation hit the edge of the available ASID space */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
+ global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
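
reset_global_asid_space() defers ASID reuse with two bitmaps: mm_free_global_asid() only marks an ASID in global_asid_freed, and freed ASIDs are folded back into the allocatable pool only after one invlpgb_flush_all_nonglobals() at rollover. A compact user-space model of that recycle step, using a 64-entry toy bitmap instead of the kernel's bitmap helpers:

/* Toy model of the used/freed bitmap recycling; 64 ASIDs for illustration. */
static unsigned long long ex_used, ex_freed;

static void ex_free(int asid)
{
	/* Freeing only marks the ASID; it is not allocatable again yet. */
	ex_freed |= 1ULL << asid;
}

static void ex_rollover(void)
{
	/* After the (modelled) TLB flush, freed ASIDs become usable again. */
	ex_used &= ~ex_freed;	/* bitmap_andnot(used, used, freed, ...) */
	ex_freed = 0;		/* bitmap_clear(freed, 0, ...) */
}
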
+
+/*
+ * Check whether a process is currently active on more than @threshold CPUs.
+ * This is a cheap estimate of whether it may make sense to assign
+ * a global ASID to this process and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+	/* This quick check should eliminate most single-threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm_global_asid(mm))
+ return;
+
+ /*
+ * The last global ASID was consumed while waiting for the lock.
+ *
+ * If this fires, a more aggressive ASID reuse scheme might be
+ * needed.
+ */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = allocate_global_asid();
+ if (!asid)
+ return;
+
+ mm_assign_global_asid(mm, asid);
+}
+
+void mm_free_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ if (!mm_global_asid(mm))
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+#endif
+}
+
+/*
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
+ */
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
+{
+ u16 global_asid = mm_global_asid(mm);
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ /* Process is transitioning to a global ASID */
+ if (global_asid && asid != global_asid)
+ return true;
+
+ return false;
+}
+
+/*
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
+ * global ASIDs are handed out to processes that have frequent TLB
+ * flushes and are active on 4 or more CPUs simultaneously.
+ */
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ if (mm_active_cpus_exceeds(mm, 3))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!mm_in_asid_transition(mm))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+		 * the target mm with an out-of-date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ mm_clear_asid_transition(mm);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long asid = mm_global_asid(info->mm);
+ unsigned long addr = info->start;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ unsigned long nr = 1;
+
+ if (info->stride_shift <= PMD_SHIFT) {
+ nr = (info->end - addr) >> info->stride_shift;
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+ }
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ __tlbsync();
+}
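
broadcast_tlb_flush() slices a ranged flush into chunks of at most invlpgb_count_max entries, issues one INVLPGB per chunk for the kernel PCID (plus one for the user PCID under PTI), and waits only once at the end with __tlbsync(). For example, with a hypothetical invlpgb_count_max of 8, flushing 10,000 4 KiB pages takes ceil(10000 / 8) = 1250 INVLPGBs, or 2500 with PTI, followed by a single TLBSYNC. A stand-alone sketch of just the slicing arithmetic:

/* Illustrative slicing only; invlpgb_count_max is hardware-reported. */
static unsigned long ex_count_invlpgbs(unsigned long start, unsigned long end,
				       unsigned int stride_shift,
				       unsigned long count_max)
{
	unsigned long addr = start, ops = 0, nr;

	while (addr < end) {
		nr = (end - addr) >> stride_shift;
		if (nr < 1)
			nr = 1;			/* clamp_val(nr, 1, count_max) */
		if (nr > count_max)
			nr = count_max;
		ops++;				/* one INVLPGB per slice */
		addr += nr << stride_shift;
	}
	return ops;
}
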
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -391,9 +667,9 @@ static void cond_mitigation(struct task_struct *next)
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
/*
- * Avoid user/user BTB poisoning by flushing the branch predictor
- * when switching between processes. This stops one process from
- * doing Spectre-v2 attacks on another.
+ * Avoid user->user BTB/RSB poisoning by flushing them when switching
+ * between processes. This stops one process from doing Spectre-v2
+ * attacks on another.
*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
@@ -449,8 +725,7 @@ static void cond_mitigation(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
- (unsigned long)next->mm)
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm)
indirect_branch_prediction_barrier();
}
@@ -503,9 +778,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
{
struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
+ unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
@@ -558,7 +833,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
@@ -571,10 +847,24 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(prev != &init_mm &&
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
+ /* Check if the current mm is transitioning to a global ASID */
+ if (mm_needs_global_asid(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this ASID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
@@ -609,32 +899,32 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cond_mitigation(tsk);
/*
- * Stop remote flushes for the previous mm.
- * Skip kernel threads; we never send init_mm TLB flushing IPIs,
- * but the bitmap manipulation can cause cache line contention.
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that CR3 is changing.
*/
- if (prev != &init_mm) {
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
- }
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
/*
- * Start remote flushes and then read tlb_gen.
+ * Leave this CPU in prev's mm_cpumask. Atomic writes to
+ * mm_cpumask can be expensive under contention. The CPU
+ * will be removed lazily at TLB flush time.
*/
- if (next != &init_mm)
+ VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
+ mm_cpumask(prev)));
+
+ /* Start receiving IPIs and then read tlb_gen (and LAM below) */
+ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
- set_tlbstate_lam_mode(next);
+reload_tlb:
+ new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -652,6 +942,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
if (next != prev) {
cr4_update_pce_mm(next);
@@ -698,6 +989,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@@ -705,7 +997,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
- WARN_ON(mm_lam_cr3_mask(mm));
+ WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -724,7 +1016,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- set_tlbstate_lam_mode(mm);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -751,7 +1043,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -762,15 +1054,28 @@ static void flush_tlb_func(void *info)
if (!local) {
inc_irq_stat(irq_tlb_count);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ }
- /* Can only happen on remote CPUs */
- if (f->mm && f->mm != loaded_mm)
- return;
+ /* The CPU was left in the mm_cpumask of the target mm. Clear it. */
+ if (f->mm && f->mm != loaded_mm) {
+ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
+ trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0);
+ return;
}
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -788,6 +1093,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -895,9 +1202,36 @@ done:
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
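
should_trim_cpumask() rate-limits the cleanup of stale mm_cpumask bits to once per second per mm: when it fires, should_flush_tlb() also selects CPUs that sit in the cpumask without running the mm, and flush_tlb_func() on those CPUs (the f->mm != loaded_mm branch above) clears them out. A toy model of the HZ-based throttle, with time_after() written out long-hand:

/* Toy model of the once-per-HZ trim throttle; not kernel code. */
static bool ex_should_trim(unsigned long *next_trim, unsigned long now,
			   unsigned long hz)
{
	if ((long)(now - *next_trim) > 0) {	/* time_after(now, *next_trim) */
		*next_trim = now + hz;
		return true;
	}
	return false;
}
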
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -928,10 +1262,10 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -975,6 +1309,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ start = 0;
+ end = TLB_FLUSH_ALL;
+ }
+
info->start = start;
info->end = end;
info->mm = mm;
@@ -982,6 +1325,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -1000,17 +1344,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1023,8 +1358,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
@@ -1037,7 +1376,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1047,7 +1385,32 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+ /* First try (faster) hardware-assisted TLB invalidation. */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ /* Fall back to the IPI-based invalidation. */
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr, nr;
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = (info->end - addr) >> PAGE_SHIFT;
+
+ /*
+ * INVLPGB has a limit on the size of ranges it can
+ * flush. Break up large flushes.
+ */
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ __tlbsync();
}
static void do_kernel_range_flush(void *info)
@@ -1060,24 +1423,37 @@ static void do_kernel_range_flush(void *info)
flush_tlb_one_kernel(addr);
}
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
-
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+}
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_kernel_range_flush(info);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
+}
- put_flush_tlb_info();
- preempt_enable();
- }
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ struct flush_tlb_info *info;
+
+ guard(preempt)();
+
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+
+ if (info->end == TLB_FLUSH_ALL)
+ kernel_tlb_flush_all(info);
+ else
+ kernel_tlb_flush_range(info);
+
+ put_flush_tlb_info();
}
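
The rewritten flush_tlb_kernel_range() replaces the explicit preempt_disable()/preempt_enable() pair with guard(preempt)() from the scoped-cleanup infrastructure in <linux/cleanup.h>, so preemption is re-enabled automatically on every exit path. A hedged sketch of the same idiom; the function below is hypothetical and only the guard usage is the point:

#include <linux/cleanup.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

/*
 * Hypothetical example, not from the patch: the guard drops preemption
 * automatically when the scope is left, on every return path.
 */
static int example_percpu_peek(int __percpu *counter)
{
	guard(preempt)();

	if (!counter)
		return -EINVAL;		/* preempt_enable() happens here too */

	return *this_cpu_ptr(counter);
}
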
/*
@@ -1143,7 +1519,7 @@ STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
bool cpu_pcide;
/* Flush 'addr' from the kernel PCID: */
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ invlpg(addr);
/* If PTI is off there is no user PCID and nothing to flush. */
if (!static_cpu_has(X86_FEATURE_PTI))
@@ -1256,7 +1632,10 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
+ invlpgb_flush_all_nonglobals();
+ batch->unmapped_pages = false;
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1298,7 +1677,7 @@ bool nmi_uaccess_okay(void)
if (loaded_mm != current_mm)
return false;
- VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+ VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa());
return true;
}