summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 22:25:21 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 22:25:21 -0700
commit b58386a9bd79222b69890141b9f9cb30a7423d8a (patch)
tree 888462c4e85dfef72ffd8225efd362164945f7d6
parent ebfb94d87b35a4b5fc6fda0cf994268555ebe415 (diff)
parent b25eb5f5e419b81f124d5ba2abaaacf1948fb97e (diff)
download linux-b58386a9bd79222b69890141b9f9cb30a7423d8a.tar.gz
linux-b58386a9bd79222b69890141b9f9cb30a7423d8a.tar.bz2
linux-b58386a9bd79222b69890141b9f9cb30a7423d8a.zip
Merge tag 'x86-boot-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 boot code updates from Ingo Molnar:

 - Memblock setup and other early boot code cleanups (Mike Rapoport)
 - Export e820_table_kexec[] to sysfs (Dave Young)
 - Baby steps of adding relocate_kernel() debugging support (David Woodhouse)
 - Replace open-coded parity calculation with parity8() (Kuan-Wei Chiu)
 - Move the LA57 trampoline to separate source file (Ard Biesheuvel)
 - Misc micro-optimizations (Uros Bizjak)
 - Drop obsolete E820_TYPE_RESERVED_KERN and related code (Mike Rapoport)

* tag 'x86-boot-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/kexec: Add relocate_kernel() debugging support: Load a GDT
  x86/boot: Move the LA57 trampoline to separate source file
  x86/boot: Do not test if AC and ID eflags are changeable on x86_64
  x86/bootflag: Replace open-coded parity calculation with parity8()
  x86/bootflag: Micro-optimize sbf_write()
  x86/boot: Add missing has_cpuflag() prototype
  x86/kexec: Export e820_table_kexec[] to sysfs
  x86/boot: Change some static bootflag functions to bool
  x86/e820: Drop obsolete E820_TYPE_RESERVED_KERN and related code
  x86/boot: Split parsing of boot_params into the parse_boot_params() helper function
  x86/boot: Split kernel resources setup into the setup_kernel_resources() helper function
  x86/boot: Move setting of memblock parameters to e820__memblock_setup()
-rw-r--r-- arch/x86/boot/compressed/Makefile 1
-rw-r--r-- arch/x86/boot/compressed/head_64.S 103
-rw-r--r-- arch/x86/boot/compressed/la57toggle.S 112
-rw-r--r-- arch/x86/boot/cpuflags.c 26
-rw-r--r-- arch/x86/boot/cpuflags.h 7
-rw-r--r-- arch/x86/include/asm/e820/api.h 1
-rw-r--r-- arch/x86/include/asm/e820/types.h 9
-rw-r--r-- arch/x86/kernel/bootflag.c 29
-rw-r--r-- arch/x86/kernel/e820.c 115
-rw-r--r-- arch/x86/kernel/relocate_kernel_64.S 23
-rw-r--r-- arch/x86/kernel/setup.c 134
-rw-r--r-- arch/x86/kernel/tboot.c 3
-rw-r--r-- arch/x86/mm/init_64.c 8
-rw-r--r-- arch/x86/virt/svm/sev.c 1
14 files changed, 266 insertions, 306 deletions
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 606c74f27459..0e0b238e8363 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -98,6 +98,7 @@ ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
+ vmlinux-objs-y += $(obj)/la57toggle.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 1dcb794c5479..3dc86352cdbe 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -483,110 +483,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
jmp *%rax
SYM_FUNC_END(.Lrelocated)
-/*
- * This is the 32-bit trampoline that will be copied over to low memory. It
- * will be called using the ordinary 64-bit calling convention from code
- * running in 64-bit mode.
- *
- * Return address is at the top of the stack (might be above 4G).
- * The first argument (EDI) contains the address of the temporary PGD level
- * page table in 32-bit addressable memory which will be programmed into
- * register CR3.
- */
- .section ".rodata", "a", @progbits
-SYM_CODE_START(trampoline_32bit_src)
- /*
- * Preserve callee save 64-bit registers on the stack: this is
- * necessary because the architecture does not guarantee that GPRs will
- * retain their full 64-bit values across a 32-bit mode switch.
- */
- pushq %r15
- pushq %r14
- pushq %r13
- pushq %r12
- pushq %rbp
- pushq %rbx
-
- /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
- movq %rsp, %rbx
- shrq $32, %rbx
-
- /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
- pushq $__KERNEL32_CS
- leaq 0f(%rip), %rax
- pushq %rax
- lretq
-
- /*
- * The 32-bit code below will do a far jump back to long mode and end
- * up here after reconfiguring the number of paging levels. First, the
- * stack pointer needs to be restored to its full 64-bit value before
- * the callee save register contents can be popped from the stack.
- */
-.Lret:
- shlq $32, %rbx
- orq %rbx, %rsp
-
- /* Restore the preserved 64-bit registers */
- popq %rbx
- popq %rbp
- popq %r12
- popq %r13
- popq %r14
- popq %r15
- retq
-
.code32
-0:
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /* Point CR3 to the trampoline's new top level page table */
- movl %edi, %cr3
-
- /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_LME, %eax
- /* Avoid writing EFER if no change was made (for TDX guest) */
- jc 1f
- wrmsr
-1:
- /* Toggle CR4.LA57 */
- movl %cr4, %eax
- btcl $X86_CR4_LA57_BIT, %eax
- movl %eax, %cr4
-
- /* Enable paging again. */
- movl %cr0, %eax
- btsl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /*
- * Return to the 64-bit calling code using LJMP rather than LRET, to
- * avoid the need for a 32-bit addressable stack. The destination
- * address will be adjusted after the template code is copied into a
- * 32-bit addressable buffer.
- */
-.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
-SYM_CODE_END(trampoline_32bit_src)
-
-/*
- * This symbol is placed right after trampoline_32bit_src() so its address can
- * be used to infer the size of the trampoline code.
- */
-SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
-
- /*
- * The trampoline code has a size limit.
- * Make sure we fail to compile if the trampoline code grows
- * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
- */
- .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
-
- .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/compressed/la57toggle.S
new file mode 100644
index 000000000000..9ee002387eb1
--- /dev/null
+++ b/arch/x86/boot/compressed/la57toggle.S
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/boot.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include "pgtable.h"
+
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory. It
+ * will be called using the ordinary 64-bit calling convention from code
+ * running in 64-bit mode.
+ *
+ * Return address is at the top of the stack (might be above 4G).
+ * The first argument (EDI) contains the address of the temporary PGD level
+ * page table in 32-bit addressable memory which will be programmed into
+ * register CR3.
+ */
+
+ .section ".rodata", "a", @progbits
+SYM_CODE_START(trampoline_32bit_src)
+ /*
+ * Preserve callee save 64-bit registers on the stack: this is
+ * necessary because the architecture does not guarantee that GPRs will
+ * retain their full 64-bit values across a 32-bit mode switch.
+ */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+
+ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
+ movq %rsp, %rbx
+ shrq $32, %rbx
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq 0f(%rip), %rax
+ pushq %rax
+ lretq
+
+ /*
+ * The 32-bit code below will do a far jump back to long mode and end
+ * up here after reconfiguring the number of paging levels. First, the
+ * stack pointer needs to be restored to its full 64-bit value before
+ * the callee save register contents can be popped from the stack.
+ */
+.Lret:
+ shlq $32, %rbx
+ orq %rbx, %rsp
+
+ /* Restore the preserved 64-bit registers */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ retq
+
+ .code32
+0:
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to the trampoline's new top level page table */
+ movl %edi, %cr3
+
+ /* Set EFER.LME=1 as a precaution in case hypervisor pulls the rug */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
+ wrmsr
+1:
+ /* Toggle CR4.LA57 */
+ movl %cr4, %eax
+ btcl $X86_CR4_LA57_BIT, %eax
+ movl %eax, %cr4
+
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /*
+ * Return to the 64-bit calling code using LJMP rather than LRET, to
+ * avoid the need for a 32-bit addressable stack. The destination
+ * address will be adjusted after the template code is copied into a
+ * 32-bit addressable buffer.
+ */
+.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
+SYM_CODE_END(trampoline_32bit_src)
+
+/*
+ * This symbol is placed right after trampoline_32bit_src() so its address can
+ * be used to infer the size of the trampoline code.
+ */
+SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
+
+ /*
+ * The trampoline code has a size limit.
+ * Make sure we fail to compile if the trampoline code grows
+ * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+ */
+ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 0cabdacb2a2f..916bac09b464 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -28,40 +28,32 @@ static int has_fpu(void)
return fsw == 0 && (fcw & 0x103f) == 0x003f;
}
+#ifdef CONFIG_X86_32
/*
* For building the 16-bit code we want to explicitly specify 32-bit
* push/pop operations, rather than just saying 'pushf' or 'popf' and
- * letting the compiler choose. But this is also included from the
- * compressed/ directory where it may be 64-bit code, and thus needs
- * to be 'pushfq' or 'popfq' in that case.
+ * letting the compiler choose.
*/
-#ifdef __x86_64__
-#define PUSHF "pushfq"
-#define POPF "popfq"
-#else
-#define PUSHF "pushfl"
-#define POPF "popfl"
-#endif
-
-int has_eflag(unsigned long mask)
+bool has_eflag(unsigned long mask)
{
unsigned long f0, f1;
- asm volatile(PUSHF " \n\t"
- PUSHF " \n\t"
+ asm volatile("pushfl \n\t"
+ "pushfl \n\t"
"pop %0 \n\t"
"mov %0,%1 \n\t"
"xor %2,%1 \n\t"
"push %1 \n\t"
- POPF " \n\t"
- PUSHF " \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
"pop %1 \n\t"
- POPF
+ "popfl"
: "=&r" (f0), "=&r" (f1)
: "ri" (mask));
return !!((f0^f1) & mask);
}
+#endif
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 475b8fde90f7..a398d9204ad0 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -15,8 +15,13 @@ struct cpu_features {
extern struct cpu_features cpu;
extern u32 cpu_vendor[3];
-int has_eflag(unsigned long mask);
+#ifdef CONFIG_X86_32
+bool has_eflag(unsigned long mask);
+#else
+static inline bool has_eflag(unsigned long mask) { return true; }
+#endif
void get_cpuflags(void);
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
+bool has_cpuflag(int flag);
#endif
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 2e74a7f0e935..c83645d5b2a8 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -29,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void);
extern u64 e820__memblock_alloc_reserved(u64 size, u64 align);
extern void e820__memblock_setup(void);
-extern void e820__reserve_setup_data(void);
extern void e820__finish_early_params(void);
extern void e820__reserve_resources(void);
extern void e820__reserve_resources_late(void);
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
index 314f75d886d0..80c4a7266629 100644
--- a/arch/x86/include/asm/e820/types.h
+++ b/arch/x86/include/asm/e820/types.h
@@ -35,15 +35,6 @@ enum e820_type {
* marking it with the IORES_DESC_SOFT_RESERVED designation.
*/
E820_TYPE_SOFT_RESERVED = 0xefffffff,
-
- /*
- * Reserved RAM used by the kernel itself if
- * CONFIG_INTEL_TXT=y is enabled, memory of this type
- * will be included in the S3 integrity calculation
- * and so should not include any memory that the BIOS
- * might alter over the S3 transition:
- */
- E820_TYPE_RESERVED_KERN = 128,
};
/*
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 3fed7ae58b60..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -8,6 +8,7 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
+#include <linux/bitops.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
@@ -20,27 +21,13 @@
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
-static int __init parity(u8 v)
-{
- int x = 0;
- int i;
-
- for (i = 0; i < 8; i++) {
- x ^= (v & 1);
- v >>= 1;
- }
-
- return x;
-}
-
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
- v &= ~SBF_PARITY;
- if (!parity(v))
- v |= SBF_PARITY;
+ if (!parity8(v))
+ v ^= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
@@ -66,14 +53,14 @@ static u8 __init sbf_read(void)
return v;
}
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
- return 0;
- if (!parity(v))
- return 0;
+ return false;
+ if (!parity8(v))
+ return false;
- return 1;
+ return true;
}
static int __init sbf_init(void)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 82b96ed9890a..57120f0749cc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -28,18 +28,13 @@
* the first 128 E820 memory entries in boot_params.e820_table and the remaining
* (if any) entries of the SETUP_E820_EXT nodes. We use this to:
*
- * - inform the user about the firmware's notion of memory layout
- * via /sys/firmware/memmap
- *
* - the hibernation code uses it to generate a kernel-independent CRC32
* checksum of the physical memory layout of a system.
*
* - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
* passed to us by the bootloader - the major difference between
- * e820_table_firmware[] and this one is that, the latter marks the setup_data
- * list created by the EFI boot stub as reserved, so that kexec can reuse the
- * setup_data information in the second kernel. Besides, e820_table_kexec[]
- * might also be modified by the kexec itself to fake a mptable.
+ * e820_table_firmware[] and this one is that e820_table_kexec[]
+ * might be modified by the kexec itself to fake an mptable.
* We use this to:
*
* - kexec, which is a bootloader in disguise, uses the original E820
@@ -47,6 +42,11 @@
* can have a restricted E820 map while the kexec()-ed kexec-kernel
* can have access to full memory - etc.
*
+ * Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ * the entries to create an E820 table for the kexec kernel.
+ *
+ * kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
* - 'e820_table': this is the main E820 table that is massaged by the
* low level x86 platform code, or modified by boot parameters, before
* passed on to higher level MM layers.
@@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type)
static void __init e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_TYPE_RAM: /* Fall through: */
- case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
+ case E820_TYPE_RAM: pr_cont("usable"); break;
case E820_TYPE_RESERVED: pr_cont("reserved"); break;
case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
@@ -764,7 +763,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn)
pfn = PFN_DOWN(entry->addr + entry->size);
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ if (entry->type != E820_TYPE_RAM)
register_nosave_region(PFN_UP(entry->addr), pfn);
if (pfn >= limit_pfn)
@@ -991,60 +990,6 @@ static int __init parse_memmap_opt(char *str)
early_param("memmap", parse_memmap_opt);
/*
- * Reserve all entries from the bootloader's extensible data nodes list,
- * because if present we are going to use it later on to fetch e820
- * entries from it:
- */
-void __init e820__reserve_setup_data(void)
-{
- struct setup_indirect *indirect;
- struct setup_data *data;
- u64 pa_data, pa_next;
- u32 len;
-
- pa_data = boot_params.hdr.setup_data;
- if (!pa_data)
- return;
-
- while (pa_data) {
- data = early_memremap(pa_data, sizeof(*data));
- if (!data) {
- pr_warn("e820: failed to memremap setup_data entry\n");
- return;
- }
-
- len = sizeof(*data);
- pa_next = data->next;
-
- e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
- if (data->type == SETUP_INDIRECT) {
- len += data->len;
- early_memunmap(data, sizeof(*data));
- data = early_memremap(pa_data, len);
- if (!data) {
- pr_warn("e820: failed to memremap indirect setup_data\n");
- return;
- }
-
- indirect = (struct setup_indirect *)data->data;
-
- if (indirect->type != SETUP_INDIRECT)
- e820__range_update(indirect->addr, indirect->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- }
-
- pa_data = pa_next;
- early_memunmap(data, len);
- }
-
- e820__update_table(e820_table);
-
- pr_info("extended physical RAM map:\n");
- e820__print_table("reserve setup_data");
-}
-
-/*
* Called after parse_early_param(), after early parameters (such as mem=)
* have been processed, in which case we already have an E820 table filled in
* via the parameter callback function(s), but it's not sorted and printed yet:
@@ -1063,7 +1008,6 @@ void __init e820__finish_early_params(void)
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return "System RAM";
case E820_TYPE_ACPI: return "ACPI Tables";
case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
@@ -1079,7 +1023,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
case E820_TYPE_ACPI: /* Fall-through: */
case E820_TYPE_NVS: /* Fall-through: */
@@ -1101,7 +1044,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
default: return IORES_DESC_NONE;
@@ -1124,7 +1066,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
return false;
- case E820_TYPE_RESERVED_KERN:
case E820_TYPE_RAM:
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
@@ -1176,9 +1117,9 @@ void __init e820__reserve_resources(void)
res++;
}
- /* Expose the bootloader-provided memory layout to the sysfs. */
- for (i = 0; i < e820_table_firmware->nr_entries; i++) {
- struct e820_entry *entry = e820_table_firmware->entries + i;
+ /* Expose the kexec e820 table to the sysfs. */
+ for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_kexec->entries + i;
firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
@@ -1302,6 +1243,36 @@ void __init e820__memblock_setup(void)
int i;
u64 end;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ /*
+ * At this point only the first megabyte is mapped for sure, the
+ * rest of the memory cannot be used for memblock resizing
+ */
+ memblock_set_current_limit(ISA_END_ADDRESS);
+
/*
* The bootstrap memblock region count maximum is 128 entries
* (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
@@ -1323,7 +1294,7 @@ void __init e820__memblock_setup(void)
if (entry->type == E820_TYPE_SOFT_RESERVED)
memblock_reserve(entry->addr, entry->size);
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ if (entry->type != E820_TYPE_RAM)
continue;
memblock_add(entry->addr, entry->size);
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index b44d8863e57f..ac058971a382 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -40,6 +40,16 @@ SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
.section .text..relocate_kernel,"ax";
.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
@@ -116,6 +126,19 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
* below.
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9f8ff3aad4f4..c7164a8de983 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -495,6 +495,46 @@ static void __init parse_setup_data(void)
}
}
+/*
+ * Translate the fields of 'struct boot_params' into global variables
+ * representing these parameters.
+ */
+static void __init parse_boot_params(void)
+{
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+}
+
static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_indirect *indirect;
@@ -593,6 +633,23 @@ void __init reserve_standard_io_resources(void)
}
+static void __init setup_kernel_resources(void)
+{
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+}
+
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -855,35 +912,7 @@ void __init setup_arch(char **cmdline_p)
setup_olpc_ofw_pgd();
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
-#endif
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
- if ((bootloader_type >> 4) == 0xe) {
- bootloader_type &= 0xf;
- bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
- }
- bootloader_version = bootloader_type & 0xf;
- bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI32_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI64_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- set_bit(EFI_64BIT, &efi.flags);
- }
-#endif
+ parse_boot_params();
x86_init.oem.arch_setup();
@@ -907,19 +936,8 @@ void __init setup_arch(char **cmdline_p)
copy_edd();
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
- code_resource.start = __pa_symbol(_text);
- code_resource.end = __pa_symbol(_etext)-1;
- rodata_resource.start = __pa_symbol(__start_rodata);
- rodata_resource.end = __pa_symbol(__end_rodata)-1;
- data_resource.start = __pa_symbol(_sdata);
- data_resource.end = __pa_symbol(_edata)-1;
- bss_resource.start = __pa_symbol(__bss_start);
- bss_resource.end = __pa_symbol(__bss_stop)-1;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -932,30 +950,6 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
-#ifdef CONFIG_MEMORY_HOTPLUG
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- if (movable_node_is_enabled())
- memblock_set_bottom_up(true);
-#endif
-
x86_report_nx();
apic_setup_apic_calls();
@@ -967,7 +961,6 @@ void __init setup_arch(char **cmdline_p)
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
- e820__reserve_setup_data();
e820__finish_early_params();
if (efi_enabled(EFI_BOOT))
@@ -987,11 +980,11 @@ void __init setup_arch(char **cmdline_p)
tsc_early_init();
x86_init.resources.probe_roms();
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &rodata_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
+ /*
+ * Add resources for kernel text and data to the iomem_resource.
+ * Do it after parse_early_param, so it can be debugged.
+ */
+ setup_kernel_resources();
e820_add_kernel_range();
trim_bios_range();
@@ -1056,7 +1049,6 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
/*
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c1bcb6053fc..46b8f1f16676 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -200,8 +200,7 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
for (i = 0; i < e820_table->nr_entries; i++) {
- if ((e820_table->entries[i].type != E820_TYPE_RAM)
- && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN))
+ if (e820_table->entries[i].type != E820_TYPE_RAM)
continue;
add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 01ea7c6df303..519aa53114fa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -469,8 +469,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
@@ -526,8 +524,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
@@ -615,8 +611,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
@@ -704,8 +698,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN) &&
- !e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 42e74a5a7d78..fc473ca12c44 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -198,7 +198,6 @@ static void __init __snp_fixup_e820_tables(u64 pa)
pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
- e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
if (!memblock_is_region_reserved(pa, PMD_SIZE))
memblock_reserve(pa, PMD_SIZE);
}