diff options
Diffstat (limited to 'arch/x86/kernel')
39 files changed, 1729 insertions, 804 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ac3b3d002833..7bd3bd310106 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,9 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o +obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o +obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index bacf4b0d91f4..230c8ea878e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> -# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -697,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d5e0d717005a..0532f5d6e4ef 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)__pa(&initial_page_table); + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e66311200cbd..b574b295a2f9 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -768,10 +768,9 @@ int __init gart_iommu_init(void) aper_base = info.aper_base; end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 9c2aa89a11cb..9a9110918ca7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -28,6 +28,7 @@ #include <asm/apic.h> #include <asm/ipi.h> #include <asm/apic_flat_64.h> +#include <asm/pgtable.h> static int numachip_system __read_mostly; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 782c456eaa01..fa96eb0d02fb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 -# include <asm/numa_64.h> # include <asm/mmconfig.h> # include <asm/cacheflush.h> #endif @@ -220,8 +219,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - if (!test_taint(TAINT_UNSAFE_SMP)) - add_taint(TAINT_UNSAFE_SMP); + add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); valid_k7: ; @@ -518,10 +516,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 dummy; - -#ifdef CONFIG_SMP unsigned long long value; +#ifdef CONFIG_SMP /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 @@ -559,12 +556,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * (AMD Erratum #110, docId: 25759). */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { - u64 val; - clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &val)) { - val &= ~(1ULL << 32); - wrmsrl_amd_safe(0xc001100d, val); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, value); } } @@ -617,13 +612,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if ((c->x86 == 0x15) && (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { - u64 val; - if (!rdmsrl_safe(0xc0011005, &val)) { - val |= 1ULL << 54; - wrmsrl_safe(0xc0011005, val); - rdmsrl(0xc0011005, val); - if (val & (1ULL << 54)) { + if (!rdmsrl_safe(0xc0011005, &value)) { + value |= 1ULL << 54; + wrmsrl_safe(0xc0011005, value); + rdmsrl(0xc0011005, value); + if (value & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); printk(KERN_INFO FW_INFO "CPU: Re-enabling " "disabled Topology Extensions Support\n"); @@ -637,11 +631,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) */ if ((c->x86 == 0x15) && (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - u64 val; - if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { - val |= 0x1E; - wrmsrl_safe(0xc0011021, val); + if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(0xc0011021, value); } } @@ -685,12 +678,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } @@ -703,13 +694,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - /* - * Disable GART TLB Walk Errors on Fam10h. We do this here - * because this is always needed when GART is enabled, even in a - * kernel which has no MCE support built in. - */ if (c->x86 == 0x10) { /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. * BIOS should disable GartTlbWlk Errors themself. If * it doesn't do it here as suggested by the BKDG. * @@ -723,6 +712,21 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mask |= (1 << 10); wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } + + /* + * On family 10h BIOS may not have properly enabled WC+ support, + * causing it to be converted to CD memtype. This may result in + * performance degradation for certain nested-paging guests. + * Prevent this conversion by clearing bit 24 in + * MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + + rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); + value &= ~(1ULL << 24); + wrmsrl_safe(MSR_AMD64_BU_CFG2, value); } rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9c3ab43a6954..d814772c5bed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -37,6 +37,8 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -213,7 +215,7 @@ static inline int flag_is_changeable_p(u32 flag) } /* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) +int __cpuinit have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -249,11 +251,6 @@ static inline int flag_is_changeable_p(u32 flag) { return 1; } -/* Probe for the CPUID instruction */ -static inline int have_cpuid_p(void) -{ - return 1; -} static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { } @@ -1223,6 +1220,12 @@ void __cpuinit cpu_init(void) int cpu; int i; + /* + * Load microcode on this cpu if a valid microcode is available. + * This is early microcode loading procedure. + */ + load_ucode_ap(); + cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); oist = &per_cpu(orig_ist, cpu); @@ -1314,6 +1317,8 @@ void __cpuinit cpu_init(void) struct tss_struct *t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; + show_ucode_info_early(); + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcaabd0432c5..1905ce98bee0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include <linux/topology.h> -#include <asm/numa_64.h> #endif #include "cpu.h" @@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void) #ifdef CONFIG_X86_F00F_BUG static void __cpuinit trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fc7608a89d93..7bc126346ace 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1082,7 +1082,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); severity = mce_severity(&m, cfg->tolerant, NULL); diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 2d5454cd2c4f..1c044b1ccc59 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -33,7 +33,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) smp_processor_id()); } - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2d7998fb628c..e9a701aecaa1 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,7 +15,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e9fe907cd249..fa72a39e5d46 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -542,7 +542,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, if (tmp != mask_lo) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); - add_taint(TAINT_FIRMWARE_WORKAROUND); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask_lo = tmp; } } diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 6336bcbd0618..5f0581e713c2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -528,7 +528,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!test_bit(IBS_STARTED, pcpu->state)) { /* * Catch spurious interrupts after stopping IBS: After - * disabling IBS there could be still incomming NMIs + * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae42418bc50f..c8797d55b245 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -232,7 +232,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) bust_spinlocks(0); die_owner = -1; - add_taint(TAINT_DIE); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df06ade26bef..d32abeabbda5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -835,7 +835,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_opt(char *p) +static int __init parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p) return *p == '\0' ? 0 : -EINVAL; } +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 8831176aa5ef..8f3e2dec1df3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -699,51 +699,6 @@ END(syscall_badsys) */ .popsection -/* - * System calls that need a pt_regs pointer. - */ -#define PTREGSCALL0(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL1(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%edx; \ - movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL2(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%ecx; \ - movl (PT_ECX+4)(%esp),%edx; \ - movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL3(name) \ -ENTRY(ptregs_##name) ; \ - CFI_STARTPROC; \ - leal 4(%esp),%eax; \ - pushl_cfi %eax; \ - movl PT_EDX(%eax),%ecx; \ - movl PT_ECX(%eax),%edx; \ - movl PT_EBX(%eax),%eax; \ - call sys_##name; \ - addl $4,%esp; \ - CFI_ADJUST_CFA_OFFSET -4; \ - ret; \ - CFI_ENDPROC; \ -ENDPROC(ptregs_##name) - -PTREGSCALL1(iopl) -PTREGSCALL0(sigreturn) -PTREGSCALL0(rt_sigreturn) -PTREGSCALL2(vm86) -PTREGSCALL1(vm86old) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 048f2240f8e6..c1d01e6ca790 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -828,23 +828,6 @@ int_restore_rest: CFI_ENDPROC END(system_call) -/* - * Certain special system calls that need to save a complete full stack frame. - */ - .macro PTREGSCALL label,func,arg -ENTRY(\label) - PARTIAL_FRAME 1 8 /* offset 8: return address */ - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - call save_rest - DEFAULT_FRAME 0 8 /* offset 8: return address */ - leaq 8(%rsp), \arg /* pt_regs pointer */ - call \func - jmp ptregscall_common - CFI_ENDPROC -END(\label) - .endm - .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC @@ -861,10 +844,22 @@ ENTRY(stub_\func) END(stub_\func) .endm + .macro FIXED_FRAME label,func +ENTRY(\label) + CFI_STARTPROC + PARTIAL_FRAME 0 8 /* offset 8: return address */ + FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET + call \func + RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET + ret + CFI_ENDPROC +END(\label) + .endm + FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - PTREGSCALL stub_iopl, sys_iopl, %rsi + FIXED_FRAME stub_iopl, sys_iopl ENTRY(ptregscall_common) DEFAULT_FRAME 1 8 /* offset 8: return address */ @@ -886,7 +881,6 @@ ENTRY(stub_execve) SAVE_REST FIXUP_TOP_OF_STACK %r11 call sys_execve - RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call @@ -902,7 +896,6 @@ ENTRY(stub_rt_sigreturn) addq $8, %rsp PARTIAL_FRAME 0 SAVE_REST - movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 call sys_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer @@ -917,7 +910,6 @@ ENTRY(stub_x32_rt_sigreturn) addq $8, %rsp PARTIAL_FRAME 0 SAVE_REST - movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 call sys32_x32_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d414029f1d8..42a392a9fd02 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } @@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, val, size); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 6773c918b8cc..138463a24877 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -33,20 +33,6 @@ void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif - /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_MRST: @@ -60,11 +46,5 @@ void __init i386_start_kernel(void) break; } - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 849fc9e63c2f..c5e403f6d869 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -26,12 +26,83 @@ #include <asm/e820.h> #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> +#include <asm/microcode.h> -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t pud, *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + +again: + pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd) + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_p[i] = 0; + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + pmd_p[pmd_index(address)] = pmd; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -42,14 +113,25 @@ static void __init clear_bss(void) (unsigned long) __bss_stop - (unsigned long) __bss_start); } +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + static void __init copy_bootdata(char *real_mode_data) { char * command_line; + unsigned long cmd_line_ptr; memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); - if (boot_params.hdr.cmd_line_ptr) { - command_line = __va(boot_params.hdr.cmd_line_ptr); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } } @@ -72,54 +154,40 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; - - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { -#ifdef CONFIG_EARLY_PRINTK + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, &early_idt_handlers[i]); -#else - set_intr_gate(i, early_idt_handler); -#endif - } load_idt((const struct desc_ptr *)&idt_descr); + copy_bootdata(__va(real_mode_data)); + + /* + * Load microcode early on BSP. + */ + load_ucode_bsp(); + if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } void __init x86_64_start_reservations(char *real_mode_data) { - copy_bootdata(__va(real_mode_data)); - - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); reserve_ebda_region(); - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3c3f58a0808f..73afd11799ca 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -144,6 +144,11 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on BSP. */ + call load_ucode_bsp +#endif + /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -299,6 +304,12 @@ ENTRY(startup_32_smp) movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on AP. */ + call load_ucode_ap +#endif + + default_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 980053c4b9cc..b7de3b25adb5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + addq $4096, %rdx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -270,13 +303,13 @@ ENDPROC(start_cpu0) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -284,7 +317,7 @@ ENDPROC(start_cpu0) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -303,6 +336,7 @@ early_idt_handlers: i = i + 1 .endr +/* This is global to keep gas from relaxing the jumps */ ENTRY(early_idt_handler) cld @@ -321,14 +355,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -350,7 +392,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -363,6 +405,9 @@ ENTRY(early_idt_handler) addq $16,%rsp # drop vector number and error code decl early_recursion_flag(%rip) INTERRUPT_RETURN +ENDPROC(early_idt_handler) + + __INITDATA .balign 4 early_recursion_flag: @@ -374,11 +419,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -388,24 +432,37 @@ ENTRY(name) i = i + 1 ; \ .endr + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ + +#ifndef CONFIG_XEN NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -413,21 +470,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -442,11 +484,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -472,6 +519,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 9c3bd4a2050e..0fa69127209a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); EXPORT_SYMBOL(__put_user_1); EXPORT_SYMBOL(__put_user_2); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8c968974253d..4ddaf66ea35f 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -93,8 +93,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. */ -long sys_iopl(unsigned int level, struct pt_regs *regs) +SYSCALL_DEFINE1(iopl, unsigned int, level) { + struct pt_regs *regs = current_pt_regs(); unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 2b44ea5f269d..b686a904d7c3 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -297,9 +297,9 @@ static void kvm_register_steal_time(void) memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); - printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", - cpu, __pa(st)); + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_info("kvm-stealtime: cpu %d, msr %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = __pa(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void) /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __get_cpu_var(kvm_apic_eoi) = 0; - pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; wrmsrl(MSR_KVM_PV_EOI_EN, pa); } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 220a360010f8..0732f0089a3d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -162,8 +162,8 @@ int kvm_register_clock(char *txt) int low, high, ret; struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(src) | 1; - high = ((u64)__pa(src) >> 32); + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); @@ -218,6 +218,9 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { unsigned long mem; + int size; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); if (!kvm_para_available()) return; @@ -231,16 +234,14 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, - PAGE_SIZE); + mem = memblock_alloc(size, PAGE_SIZE); if (!mem) return; hv_clock = __va(mem); if (kvm_register_clock("boot clock")) { hv_clock = NULL; - memblock_free(mem, - sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + memblock_free(mem, size); return; } pv_time_ops.sched_clock = kvm_clock_read; @@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; - size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); preempt_disable(); cpu = smp_processor_id(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db39db6..4eabc160696f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -16,125 +16,12 @@ #include <linux/io.h> #include <linux/suspend.h> +#include <asm/init.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> -static int init_one_level2_page(struct kimage *image, pgd_t *pgd, - unsigned long addr) -{ - pud_t *pud; - pmd_t *pmd; - struct page *page; - int result = -ENOMEM; - - addr &= PMD_MASK; - pgd += pgd_index(addr); - if (!pgd_present(*pgd)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pud = (pud_t *)page_address(page); - clear_page(pud); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pmd = (pmd_t *)page_address(page); - clear_page(pmd); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - result = 0; -out: - return result; -} - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) - goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -184,22 +71,62 @@ err: return result; } +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long mstart, mend; pgd_t *level4p; int result; + int i; + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + clear_page(level4p); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } + /* - * image->start may be outside 0 ~ max_pfn, for example when - * jump back to original kernel from kexeced kernel + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. */ - result = init_one_level2_page(image, level4p, image->start); - if (result) - return result; + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + return init_transition_pgtable(image, level4p); } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 3a04b224d0c0..22db92bbdf1a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -364,10 +364,7 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_ops->microcode_fini_cpu(cpu); - uci->valid = 0; } static enum ucode_state microcode_resume_cpu(int cpu) @@ -383,6 +380,10 @@ static enum ucode_state microcode_resume_cpu(int cpu) static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) { enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci && uci->valid) + return UCODE_OK; if (collect_cpu_info(cpu)) return UCODE_ERROR; diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c new file mode 100644 index 000000000000..577db8417d15 --- /dev/null +++ b/arch/x86/kernel/microcode_core_early.c @@ -0,0 +1,76 @@ +/* + * X86 CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int __cpuinit x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + if (!have_cpuid_p()) + return X86_VENDOR_UNKNOWN; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +void __init load_ucode_bsp(void) +{ + int vendor = x86_vendor(); + + if (vendor == X86_VENDOR_INTEL) + load_ucode_intel_bsp(); +} + +void __cpuinit load_ucode_ap(void) +{ + int vendor = x86_vendor(); + + if (vendor == X86_VENDOR_INTEL) + load_ucode_intel_ap(); +} diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3544aed39338..5fb2cebf556b 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -79,7 +79,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> -#include <asm/microcode.h> +#include <asm/microcode_intel.h> #include <asm/processor.h> #include <asm/msr.h> @@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); MODULE_LICENSE("GPL"); -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[0]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -#define DEFAULT_UCODE_DATASIZE (2000) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) - -#define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.totalsize ? \ - ((struct microcode_intel *)mc)->hdr.totalsize : \ - DEFAULT_UCODE_TOTALSIZE) - -#define get_datasize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); @@ -162,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) -{ - return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; -} - -static inline int -update_match_revision(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - -static int microcode_sanity_check(void *mc) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - pr_err("error! Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - pr_err("error! Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - pr_err("error! Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - pr_err("error! Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - pr_warning("aborting, bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} - /* * return 0 - no update found * return 1 - found update */ -static int -get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) { - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - if (!update_match_revision(mc_header, rev)) - return 0; - - if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) - return 1; + struct cpu_signature cpu_sig; + unsigned int csig, cpf, crev; - /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; + collect_cpu_info(cpu, &cpu_sig); - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + csig = cpu_sig.sig; + cpf = cpu_sig.pf; + crev = cpu_sig.rev; - for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; + return get_matching_microcode(csig, cpf, mc_intel, crev); } -static int apply_microcode(int cpu) +int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; @@ -300,6 +144,14 @@ static int apply_microcode(int cpu) if (mc_intel == NULL) return 0; + /* + * Microcode on this CPU could be updated earlier. Only apply the + * microcode patch in mc_intel when it is newer than the one on this + * CPU. + */ + if (get_matching_mc(mc_intel, cpu) == 0) + return 0; + /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, (unsigned long) mc_intel->bits, @@ -338,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, unsigned int leftover = size; enum ucode_state state = UCODE_OK; unsigned int curr_mc_size = 0; + unsigned int csig, cpf; while (leftover) { struct microcode_header_intel mc_header; @@ -362,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, } if (get_ucode_data(mc, ucode_ptr, mc_size) || - microcode_sanity_check(mc) < 0) { + microcode_sanity_check(mc, 1) < 0) { break; } - if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (get_matching_microcode(csig, cpf, mc, new_rev)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; @@ -393,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. + */ + save_mc_for_early(new_mc); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); out: diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c new file mode 100644 index 000000000000..7890bc838952 --- /dev/null +++ b/arch/x86/kernel/microcode_intel_early.c @@ -0,0 +1,796 @@ +/* + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/earlycpio.h> +#include <linux/initrd.h> +#include <linux/cpu.h> +#include <asm/msr.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state __cpuinit +generic_load_microcode_early(struct microcode_intel **mc_saved_p, + unsigned int mc_saved_count, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + int new_rev = uci->cpu_sig.rev; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + unsigned int csig = uci->cpu_sig.sig; + unsigned int cpf = uci->cpu_sig.pf; + int i; + + for (i = 0; i < mc_saved_count; i++) { + ucode_ptr = mc_saved_p[i]; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_size = get_totalsize(mc_header); + if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { + new_rev = mc_header->rev; + new_mc = ucode_ptr; + } + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + uci->mc = (struct microcode_intel *)new_mc; +out: + return state; +} + +static void __cpuinit +microcode_pointer(struct microcode_intel **mc_saved, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, int mc_saved_count) +{ + int i; + + for (i = 0; i < mc_saved_count; i++) + mc_saved[i] = (struct microcode_intel *) + (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void __cpuinit +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_symbol(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa(p); + } +} +#endif + +static enum ucode_state __cpuinit +load_microcode(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, + initrd_start, count); + + return generic_load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else + return generic_load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +static u8 get_x86_family(unsigned long sig) +{ + u8 x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ + u8 x86, x86_model; + + x86 = get_x86_family(sig); + x86_model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + x86_model += ((sig >> 16) & 0xf) << 4; + + return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + u8 x86, x86_model; + u8 x86_ucode, x86_model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + x86 = get_x86_family(sig); + x86_model = get_x86_model(sig); + + x86_ucode = get_x86_family(mc_header->sig); + x86_model_ucode = get_x86_model(mc_header->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (struct extended_sigtable *) + mc_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + x86_ucode = get_x86_family(ext_sig->sig); + x86_model_ucode = get_x86_model(ext_sig->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + ext_sig++; + } + + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **mc_saved_p; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), + GFP_KERNEL); + if (!mc_saved_p) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_intel *mc = mc_saved_src[i]; + struct microcode_header_intel *mc_header = &mc->hdr; + unsigned long mc_size = get_totalsize(mc_header); + mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); + if (!mc_saved_p[i]) { + ret = -ENOMEM; + goto err; + } + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + memcpy(mc_saved_p[i], mc, mc_size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(mc_saved_p[j]); + kfree(mc_saved_p); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, + unsigned int *mc_saved_count_p) +{ + int i; + int found = 0; + unsigned int mc_saved_count = *mc_saved_count_p; + struct microcode_header_intel *mc_header; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + for (i = 0; i < mc_saved_count; i++) { + unsigned int sig, pf; + unsigned int new_rev; + struct microcode_header_intel *mc_saved_header = + (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + new_rev = mc_header->rev; + + if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { + found = 1; + if (update_match_revision(mc_header, new_rev)) { + /* + * Found an older ucode saved before. + * Replace the older one with this newer + * one. + */ + mc_saved[i] = + (struct microcode_intel *)ucode_ptr; + break; + } + } + } + if (i >= mc_saved_count && !found) + /* + * This ucode is first time discovered in ucode file. + * Save it to memory. + */ + mc_saved[mc_saved_count++] = + (struct microcode_intel *)ucode_ptr; + + *mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover) { + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +#define native_rdmsr(msr, val1, val2) \ +do { \ + u64 __val = native_read_msr((msr)); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ +} while (0) + +#define native_wrmsr(msr, low, high) \ + native_write_msr(msr, low, high); + +static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + u8 x86, x86_model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + x86 = get_x86_family(csig.sig); + x86_model = get_x86_model(csig.sig); + + if ((x86_model >= 5) || (x86 > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no micorcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", + smp_processor_id(), sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (struct extended_sigtable *) + mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + cpu_hotplug_driver_lock(); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct mirocode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + + _save_mc(mc_saved_tmp, mc, &mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Can not save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcod data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + cpu_hotplug_driver_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + unsigned int size = end - start + 1; + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_symbol(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; + + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, mc_saved_in_initrd, + uci); +} + +/* + * Print ucode update info. + */ +static void __cpuinit +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void __cpuinit show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void __cpuinit print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info); + current_mc_date_p = (int *)__pa_symbol(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void __cpuinit flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + print_ucode(uci); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + */ +int __init save_microcode_in_initrd(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Can not save microcod patches from initrd"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start_early, + unsigned long initrd_end_early, + struct ucode_cpu_info *uci) +{ + collect_cpu_info_early(uci); + scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, + mc_saved_in_initrd, uci); + load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ + u64 ramdisk_image, ramdisk_size; + unsigned long initrd_start_early, initrd_end_early; + struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 + struct boot_params *boot_params_p; + + boot_params_p = (struct boot_params *)__pa_symbol(&boot_params); + ramdisk_image = boot_params_p->hdr.ramdisk_image; + ramdisk_size = boot_params_p->hdr.ramdisk_size; + initrd_start_early = ramdisk_image; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_symbol(&mc_saved_data), + (unsigned long *)__pa_symbol(&mc_saved_in_initrd), + initrd_start_early, initrd_end_early, &uci); +#else + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + initrd_start_early = ramdisk_image + PAGE_OFFSET; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, + initrd_start_early, initrd_end_early, &uci); +#endif +} + +void __cpuinit load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_symbol(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start); + initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c new file mode 100644 index 000000000000..ce69320d0179 --- /dev/null +++ b/arch/x86/kernel/microcode_intel_lib.c @@ -0,0 +1,174 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, + unsigned int sig, unsigned int pf) +{ + return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("error! Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("error! Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("error! Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("error! Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + if (print_err) + pr_warn("aborting, bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + + if (!update_match_revision(mc_header, rev)) + return 0; + + return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6e68a6194965..0f49677da51e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,7 +117,7 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt, dead_task->mm->context.size); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8b24289cc10c..9c857f05cef0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,17 +108,16 @@ #include <asm/topology.h> #include <asm/apicdef.h> #include <asm/amd_nb.h> -#ifdef CONFIG_X86_64 -#include <asm/numa_64.h> -#endif #include <asm/mce.h> #include <asm/alternative.h> #include <asm/prom.h> /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -296,8 +284,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa(_brk_start), - __pa(_brk_end) - __pa(_brk_start)); + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -306,27 +294,43 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + return ramdisk_size; +} + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; - /* We need to move the initrd down into lowmem */ - ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, - PAGE_SIZE); + /* We need to move the initrd down into directly mapped mem */ + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + area_size, PAGE_SIZE); if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the lowmem currently occupied by + /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact. */ memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; @@ -336,17 +340,7 @@ static void __init relocate_initrd(void) q = (char *)initrd_start; - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ + /* Copy the initrd */ while (ramdisk_size) { slop = ramdisk_image & ~PAGE_MASK; clen = ramdisk_size; @@ -360,22 +354,35 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } - /* high pages is not converted by early_res_to_bootmem */ - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_image = get_ramdisk_image(); + ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -383,22 +390,18 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= (end_of_lowmem>>1)) { + mapped_size = memblock_mem_size(max_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, end_of_lowmem>>1); - } + ramdisk_size, mapped_size>>1); printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; return; @@ -409,6 +412,9 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else +static void __init early_reserve_initrd(void) +{ +} static void __init reserve_initrd(void) { } @@ -419,8 +425,6 @@ static void __init parse_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { u32 data_len, map_len; @@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void) u64 pa_data; int found = 0; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this - * limit once kexec-tools are fixed. */ #ifdef CONFIG_X86_32 # define CRASH_KERNEL_ADDR_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX (896 << 20) +# define CRASH_KERNEL_ADDR_MAX MAXMEM #endif +static void __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long low_base = 0, low_size = 0; + unsigned long total_low_mem; + unsigned long long base; + int ret; + + total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + ret = parse_crashkernel_low(boot_command_line, total_low_mem, + &low_size, &base); + if (ret != 0 || low_size <= 0) + return; + + low_base = memblock_find_in_range(low_size, (1ULL<<32), + low_size, alignment); + + if (!low_base) { + pr_info("crashkernel low reservation failed - No suitable area found.\n"); + + return; + } + + memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif +} + static void __init reserve_crashkernel(void) { + const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; @@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - const unsigned long long alignment = 16<<20; /* 16M */ - /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ @@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void) pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } + } else { unsigned long long start; @@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); + + if (crash_base >= (1ULL<<32)) + reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) @@ -608,8 +644,6 @@ static __init void reserve_ibft_region(void) memblock_reserve(addr, size); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -698,8 +732,7 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), - E820_RAM, E820_RESERVED); + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); /* * special case: Some BIOSen report the PC BIOS @@ -711,6 +744,29 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820_all_mapped(start, start + size, E820_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + e820_remove_range(start, size, E820_RAM, 0); + e820_add_region(start, size, E820_RAM); +} + +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static int __init parse_reservelow(char *p) { unsigned long long size; @@ -733,6 +789,11 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); +static void __init trim_low_memory_range(void) +{ + memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -748,6 +809,17 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + + early_reserve_initrd(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -835,12 +907,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE @@ -906,6 +978,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + e820_add_kernel_range(); trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { @@ -955,6 +1028,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with @@ -964,7 +1039,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = get_max_mapped(); + memblock.current_limit = ISA_END_ADDRESS; memblock_x86_fill(); /* @@ -981,41 +1056,31 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + /* + * In the memory hotplug case, the kernel needs info from SRAT to + * determine which memory is hotpluggable before allocating memory + * using memblock. + */ + acpi_boot_table_init(); + early_acpi_boot_init(); + early_parse_srat(); + +#ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT) - 1); +#endif - setup_real_mode(); + reserve_real_mode(); trim_platform_memory_ranges(); + trim_low_memory_range(); - init_gbpages(); + init_mem_mapping(); - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); - max_pfn_mapped = max_low_pfn_mapped; + early_trap_pf_init(); -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { - - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; - - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } + setup_real_mode(); - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); @@ -1045,10 +1110,6 @@ void __init setup_arch(char **cmdline_p) /* * Parse the ACPI tables for possible boot-time SMP configuration. */ - acpi_boot_table_init(); - - early_acpi_boot_init(); - initmem_init(); memblock_find_dma_reserve(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d6bf1f34a6e9..69562992e457 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -278,7 +278,7 @@ static const struct { }; static int -__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, +__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; @@ -286,7 +286,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -307,8 +307,8 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); else restorer = &frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ err |= __put_user(restorer, &frame->pretcode); @@ -327,7 +327,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; regs->ax = (unsigned long)sig; regs->dx = 0; regs->cx = 0; @@ -340,7 +340,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, return 0; } -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; @@ -348,7 +348,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -368,8 +368,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); /* @@ -382,7 +382,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); } put_user_catch(err); - err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_siginfo_to_user(&frame->info, &ksig->info); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -392,7 +392,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -405,20 +405,20 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; } #else /* !CONFIG_X86_32 */ -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; int err = 0; - frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); + frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, info)) + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } @@ -434,8 +434,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - put_user_ex(ka->sa.sa_restorer, &frame->pretcode); + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ err |= -EFAULT; @@ -457,7 +457,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; @@ -469,8 +469,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, +static int x32_setup_rt_frame(struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs) { #ifdef CONFIG_X86_X32_ABI @@ -479,13 +479,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, info)) + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; } @@ -499,8 +499,8 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); - if (ka->sa.sa_flags & SA_RESTORER) { - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; } else { /* could use a vstub here */ restorer = NULL; @@ -518,10 +518,10 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* We use the x32 calling convention here... */ - regs->di = sig; + regs->di = ksig->sig; regs->si = (unsigned long) &frame->info; regs->dx = (unsigned long) &frame->uc; @@ -535,70 +535,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, return 0; } -#ifdef CONFIG_X86_32 -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - sigset_t blocked; - siginitset(&blocked, mask); - return sigsuspend(&blocked); -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret = 0; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act))) - return -EFAULT; - - get_user_try { - get_user_ex(new_ka.sa.sa_handler, &act->sa_handler); - get_user_ex(new_ka.sa.sa_flags, &act->sa_flags); - get_user_ex(mask, &act->sa_mask); - get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer); - } get_user_catch(ret); - - if (ret) - return -EFAULT; - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact))) - return -EFAULT; - - put_user_try { - put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler); - put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags); - put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer); - } put_user_catch(ret); - - if (ret) - return -EFAULT; - } - - return ret; -} -#endif /* CONFIG_X86_32 */ - /* * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(struct pt_regs *regs) +unsigned long sys_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; unsigned long ax; sigset_t set; @@ -625,8 +568,9 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(struct pt_regs *regs) +long sys_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -667,30 +611,29 @@ static int signr_convert(int sig) } static int -setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - struct pt_regs *regs) +setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = signr_convert(sig); + int usig = signr_convert(ksig->sig); sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ if (is_ia32_frame()) { - if (ka->sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ka, info, cset, regs); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + return ia32_setup_rt_frame(usig, ksig, cset, regs); else - return ia32_setup_frame(usig, ka, cset, regs); + return ia32_setup_frame(usig, ksig, cset, regs); } else if (is_x32_frame()) { - return x32_setup_rt_frame(usig, ka, info, cset, regs); + return x32_setup_rt_frame(ksig, cset, regs); } else { - return __setup_rt_frame(sig, ka, info, set, regs); + return __setup_rt_frame(ksig->sig, ksig, set, regs); } } static void -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs) +handle_signal(struct ksignal *ksig, struct pt_regs *regs) { + bool failed; /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -701,7 +644,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } @@ -721,26 +664,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - if (setup_rt_frame(sig, ka, info, regs) < 0) { - force_sigsegv(sig, current); - return; + failed = (setup_rt_frame(ksig, regs) < 0); + if (!failed) { + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; } - - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - - signal_delivered(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); + signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } #ifdef CONFIG_X86_32 @@ -757,14 +695,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ static void do_signal(struct pt_regs *regs) { - struct k_sigaction ka; - siginfo_t info; - int signr; + struct ksignal ksig; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, regs); + handle_signal(&ksig, regs); return; } @@ -843,8 +778,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) +asmlinkage long sys32_x32_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long ax; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca11f4e9..68bda7a84159 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -688,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 1dfe69cc78a8..1cf5766dde16 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -202,7 +202,7 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -222,7 +222,7 @@ int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) if (tmp) goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = regs; + info.regs32 = current_pt_regs(); tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ @@ -231,7 +231,7 @@ out: } -int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -272,7 +272,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) ret = -EFAULT; if (tmp) goto out; - info.regs32 = regs; + info.regs32 = current_pt_regs(); info.vm86plus.is_vm86pus = 1; tsk->thread.vm86_info = (struct vm86_struct __user *)v86; do_sys_vm86(&info, tsk); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1330dd102950..b014d9414d08 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); +#ifndef CONFIG_DEBUG_VIRTUAL +EXPORT_SYMBOL(phys_base); +#endif EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d065d67c2672..45a14dbbddaf 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -63,10 +63,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, |