diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-13 18:28:19 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-13 18:28:19 -0700 |
commit | 13e091b6dd0e78a518a7d8756607d3acb8215768 (patch) | |
tree | ab7b6eef8d53008602be8dd5966655816abfeda6 | |
parent | eac341194426ba7ead3444923b9eba491ae4feeb (diff) | |
parent | 1088c6eef261939bda8346ec35b513790a2111d5 (diff) | |
download | linux-stable-13e091b6dd0e78a518a7d8756607d3acb8215768.tar.gz linux-stable-13e091b6dd0e78a518a7d8756607d3acb8215768.tar.bz2 linux-stable-13e091b6dd0e78a518a7d8756607d3acb8215768.zip |
Merge branch 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 timer updates from Thomas Gleixner:
"Early TSC based time stamping to allow better boot time analysis.
This comes with a general cleanup of the TSC calibration code which
grew warts and duct taping over the years and removes 250 lines of
code. Initiated and mostly implemented by Pavel with help from various
folks"
* 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
x86/kvmclock: Mark kvm_get_preset_lpj() as __init
x86/tsc: Consolidate init code
sched/clock: Disable interrupts when calling generic_sched_clock_init()
timekeeping: Prevent false warning when persistent clock is not available
sched/clock: Close a hole in sched_clock_init()
x86/tsc: Make use of tsc_calibrate_cpu_early()
x86/tsc: Split native_calibrate_cpu() into early and late parts
sched/clock: Use static key for sched_clock_running
sched/clock: Enable sched clock early
sched/clock: Move sched clock initialization and merge with generic clock
x86/tsc: Use TSC as sched clock early
x86/tsc: Initialize cyc2ns when tsc frequency is determined
x86/tsc: Calibrate tsc only once
ARM/time: Remove read_boot_clock64()
s390/time: Remove read_boot_clock64()
timekeeping: Default boot time offset to local_clock()
timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()
s390/time: Add read_persistent_wall_and_boot_offset()
x86/xen/time: Output xen sched_clock time from 0
x86/xen/time: Initialize pv xen time in init_hypervisor_platform()
...
41 files changed, 508 insertions, 754 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c370f5f0eb38..5cde1ff32ff3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2835,8 +2835,6 @@ nosync [HW,M68K] Disables sync negotiation for all devices. - notsc [BUGS=X86-32] Disable Time Stamp Counter - nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 3a2ba436721a..ad6d2a80cf05 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -92,9 +92,7 @@ APICs Timing notsc - Don't use the CPU time stamp counter to read the wall time. - This can be used to work around timing problems on multiprocessor systems - with not properly synchronized CPUs. + Deprecated, use tsc=unstable instead. nohpet Don't use the HPET timer. diff --git a/arch/arm/include/asm/mach/time.h b/arch/arm/include/asm/mach/time.h index 0f79e4dec7f9..4ac3a019a46f 100644 --- a/arch/arm/include/asm/mach/time.h +++ b/arch/arm/include/asm/mach/time.h @@ -13,7 +13,6 @@ extern void timer_tick(void); typedef void (*clock_access_fn)(struct timespec64 *); -extern int register_persistent_clock(clock_access_fn read_boot, - clock_access_fn read_persistent); +extern int register_persistent_clock(clock_access_fn read_persistent); #endif diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index cf2701cb0de8..078b259ead4e 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -83,29 +83,18 @@ static void dummy_clock_access(struct timespec64 *ts) } static clock_access_fn __read_persistent_clock = dummy_clock_access; -static clock_access_fn __read_boot_clock = dummy_clock_access; void read_persistent_clock64(struct timespec64 *ts) { __read_persistent_clock(ts); } -void read_boot_clock64(struct timespec64 *ts) -{ - __read_boot_clock(ts); -} - -int __init register_persistent_clock(clock_access_fn read_boot, - clock_access_fn read_persistent) +int __init register_persistent_clock(clock_access_fn read_persistent) { /* Only allow the clockaccess functions to be registered once */ - if (__read_persistent_clock == dummy_clock_access && - __read_boot_clock == dummy_clock_access) { - if (read_boot) - __read_boot_clock = read_boot; + if (__read_persistent_clock == dummy_clock_access) { if (read_persistent) __read_persistent_clock = read_persistent; - return 0; } diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index 2438b96004c1..fcc5bfec8bd1 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -110,7 +110,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase) } sched_clock_register(omap_32k_read_sched_clock, 32, 32768); - register_persistent_clock(NULL, omap_read_persistent_clock64); + register_persistent_clock(omap_read_persistent_clock64); pr_info("OMAP clocksource: 32k_counter at 32768 Hz\n"); return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index cf561160ea88..e8766beee5ad 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -221,17 +221,22 @@ void read_persistent_clock64(struct timespec64 *ts) ext_to_timespec64(clk, ts); } -void read_boot_clock64(struct timespec64 *ts) +void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { unsigned char clk[STORE_CLOCK_EXT_SIZE]; + struct timespec64 boot_time; __u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - memcpy(clk, tod_clock_base, 16); - *(__u64 *) &clk[1] -= delta; - if (*(__u64 *) &clk[1] > delta) + memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE); + *(__u64 *)&clk[1] -= delta; + if (*(__u64 *)&clk[1] > delta) clk[0]--; - ext_to_timespec64(clk, ts); + ext_to_timespec64(clk, &boot_time); + + read_persistent_clock64(wall_time); + *boot_offset = timespec64_sub(*wall_time, boot_time); } static u64 read_tod_clock(struct clocksource *cs) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cf090e584202..7ed08a7c3398 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -76,4 +76,17 @@ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +/* Useful macros */ +#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ +{ \ + .vendor = X86_VENDOR_INTEL, \ + .family = _family, \ + .model = _model, \ + .feature = X86_FEATURE_ANY, \ + .driver_data = (kernel_ulong_t)&_driver_data \ +} + +#define INTEL_CPU_FAM6(_model, _driver_data) \ + INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data) + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index fe04491130ae..52f815a80539 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -80,35 +80,6 @@ enum intel_mid_cpu_type { extern enum intel_mid_cpu_type __intel_mid_cpu_chip; -/** - * struct intel_mid_ops - Interface between intel-mid & sub archs - * @arch_setup: arch_setup function to re-initialize platform - * structures (x86_init, x86_platform_init) - * - * This structure can be extended if any new interface is required - * between intel-mid & its sub arch files. - */ -struct intel_mid_ops { - void (*arch_setup)(void); -}; - -/* Helper API's for INTEL_MID_OPS_INIT */ -#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ - [cpuid] = get_##cpuname##_ops - -/* Maximum number of CPU ops */ -#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) - -/* - * For every new cpu addition, a weak get_<cpuname>_ops() function needs be - * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. - */ -#define INTEL_MID_OPS_INIT { \ - DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ - DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ - DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ -}; - #ifdef CONFIG_X86_INTEL_MID static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) @@ -136,20 +107,6 @@ enum intel_mid_timer_options { extern enum intel_mid_timer_options intel_mid_timer_options; -/* - * Penwell uses spread spectrum clock, so the freq number is not exactly - * the same as reported by MSR based on SDM. - */ -#define FSB_FREQ_83SKU 83200 -#define FSB_FREQ_100SKU 99840 -#define FSB_FREQ_133SKU 133000 - -#define FSB_FREQ_167SKU 167000 -#define FSB_FREQ_200SKU 200000 -#define FSB_FREQ_267SKU 267000 -#define FSB_FREQ_333SKU 333000 -#define FSB_FREQ_400SKU 400000 - /* Bus Select SoC Fuse value */ #define BSEL_SOC_FUSE_MASK 0x7 /* FSB 133MHz */ diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index 46185263d9c2..000000000000 --- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3aea2658323a..4c723632c036 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,7 +7,6 @@ #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 2ecd34e2d46c..e85ff65c43c3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -37,5 +37,6 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern int after_bootmem; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 2701d221583a..eb5bbfeccb66 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -33,13 +33,13 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -extern void tsc_early_delay_calibrate(void); +extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); -extern unsigned long native_calibrate_cpu(void); +extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is runnig we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 38915fbfae73..b732438c1a1e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -617,6 +615,14 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -863,9 +869,6 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index df28e931d732..ba6b8bb1c036 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1019,6 +1019,24 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } /* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1092,6 +1110,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); } void __init early_cpu_init(void) @@ -1127,24 +1147,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. - * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1207,8 +1209,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if (type == JUMP_LABEL_JMP) { if (init) { /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a37bda38d205..09aaabb2bbf1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> -#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; @@ -628,6 +615,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3b8e7c13c614..d2edd7e6c294 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,30 +23,56 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/memblock.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/mm.h> +#include <asm/hypervisor.h> #include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; } early_param("no-kvmclock", parse_no_kvmclock); -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. Since then, some time may @@ -55,21 +81,10 @@ static struct pvclock_wall_clock *wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); - - put_cpu(); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -79,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -112,11 +123,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -130,19 +141,11 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return tsc_khz; + return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) +static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; @@ -156,49 +159,40 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); + if (!src) + return; - return ret; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -213,11 +207,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -245,100 +235,84 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) +static int __init kvm_setup_vsyscall_timeinfo(void) { - phys_addr_t mem; +#ifdef CONFIG_X86_64 + u8 flags; - mem = memblock_alloc(size, align); - if (!mem) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - return mem; -e_free: - memblock_free(mem, size); + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif return 0; } +early_initcall(kvm_setup_vsyscall_timeinfo); -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +static int kvmclock_setup_percpu(unsigned int cpu) { - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); - memblock_free(addr, size); + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; } void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; } - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { return; } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - pvclock_set_pvti_cpu0_va(hv_clock); + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -350,31 +324,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - put_cpu(); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2f86d883dd95..5d32c55aeb8b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -866,6 +866,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1012,6 +1014,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1197,11 +1200,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1272,8 +1270,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -106,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) @@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes @@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP @@ -1307,7 +1340,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1341,40 +1374,22 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) -{ - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1383,52 +1398,78 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + loops_per_jiffy = get_loops_per_jiffy(); + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; - if (tsc_disabled > 0) + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } - /* now allow native_sched_clock() to use rdtsc */ + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); @@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,17 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> - * - * This file is released under the GPLv2. */ #include <linux/kernel.h> -#include <asm/processor.h> -#include <asm/setup.h> + #include <asm/apic.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/msr.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 @@ -23,44 +25,48 @@ * field msr_plat does. */ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. + */ +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. @@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index fa021dfab088..5cf886c867c2 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfld.o pwr.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2ebdf31d9996..56f66eafb94f 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -36,8 +36,6 @@ #include <asm/apb_timer.h> #include <asm/reboot.h> -#include "intel_mid_weak_decls.h" - /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_intel_mid_timer can be used to override the configuration @@ -61,10 +59,6 @@ enum intel_mid_timer_options intel_mid_timer_options; -/* intel_mid_ops to store sub arch ops */ -static struct intel_mid_ops *intel_mid_ops; -/* getter function for sub arch ops*/ -static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); @@ -82,11 +76,6 @@ static void intel_mid_reboot(void) intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); } -static unsigned long __init intel_mid_calibrate_tsc(void) -{ - return 0; -} - static void __init intel_mid_setup_bp_timer(void) { apbt_time_init(); @@ -133,6 +122,7 @@ static void intel_mid_arch_setup(void) case 0x3C: case 0x4A: __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER; + x86_platform.legacy.rtc = 1; break; case 0x27: default: @@ -140,17 +130,7 @@ static void intel_mid_arch_setup(void) break; } - if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops)) - intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); - else { - intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Unknown SoC, assuming Penwell!\n"); - } - out: - if (intel_mid_ops->arch_setup) - intel_mid_ops->arch_setup(); - /* * Intel MID platforms are using explicitly defined regulators. * @@ -191,7 +171,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.arch_init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h deleted file mode 100644 index 3c1c3866d82b..000000000000 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * intel_mid_weak_decls.h: Weak declarations of intel-mid.c - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - - -/* For every CPU addition a new get_<cpuname>_ops interface needs - * to be added. - */ -extern void *get_penwell_ops(void); -extern void *get_cloverview_ops(void); -extern void *get_tangier_ops(void); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c deleted file mode 100644 index e42978d4deaf..000000000000 --- a/arch/x86/platform/intel-mid/mfld.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * mfld.c: Intel Medfield platform setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init mfld_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = FSB_FREQ_83SKU; - else - fsb = FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init penwell_arch_setup(void) -{ - x86_platform.calibrate_tsc = mfld_calibrate_tsc; -} - -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -void *get_penwell_ops(void) -{ - return &penwell_ops; -} - -void *get_cloverview_ops(void) -{ - return &penwell_ops; -} diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c deleted file mode 100644 index ae7bdeb0e507..000000000000 --- a/arch/x86/platform/intel-mid/mrfld.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Intel Merrifield platform specific setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init tangier_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb, bus_freq; - - /* *********************** */ - /* Compute TSC:Ratio * FSB */ - /* *********************** */ - - /* Compute Ratio */ - rdmsr(MSR_PLATFORM_INFO, lo, hi); - pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo); - - ratio = (lo >> 8) & 0xFF; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("Read a zero ratio, force tsc ratio to 4 ...\n"); - ratio = 4; - } - - /* Compute FSB */ - rdmsr(MSR_FSB_FREQ, lo, hi); - pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n", - hi, lo); - - bus_freq = lo & 0x7; - pr_debug("bus_freq = 0x%x\n", bus_freq); - - if (bus_freq == 0) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 1) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 2) - fsb = FSB_FREQ_133SKU; - else if (bus_freq == 3) - fsb = FSB_FREQ_167SKU; - else if (bus_freq == 4) - fsb = FSB_FREQ_83SKU; - else if (bus_freq == 5) - fsb = FSB_FREQ_400SKU; - else if (bus_freq == 6) - fsb = FSB_FREQ_267SKU; - else if (bus_freq == 7) - fsb = FSB_FREQ_333SKU; - else { - BUG(); - pr_err("Invalid bus_freq! Setting to minimal value!\n"); - fsb = FSB_FREQ_100SKU; - } - - /* TSC = FSB Freq * Resolved HFM Ratio */ - fast_calibrate = ratio * fsb; - pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate); - - /* ************************************ */ - /* Calculate Local APIC Timer Frequency */ - /* ************************************ */ - lapic_timer_frequency = (fsb * 1000) / HZ; - - pr_debug("Setting lapic_timer_frequency = %d\n", - lapic_timer_frequency); - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init tangier_arch_setup(void) -{ - x86_platform.calibrate_tsc = tangier_calibrate_tsc; - x86_platform.legacy.rtc = 1; -} - -/* tangier arch ops */ -static struct intel_mid_ops tangier_ops = { - .arch_setup = tangier_arch_setup, -}; - -void *get_tangier_ops(void) -{ - return &tangier_ops; -} diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 439a94bf89ad..105a57d73701 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -119,6 +119,27 @@ static void __init xen_banner(void) version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } + +static void __init xen_pv_init_platform(void) +{ + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */ + xen_vcpu_info_reset(0); + + /* pvclock is in shared info area */ + xen_init_time_ops(); +} + +static void __init xen_pv_guest_late_init(void) +{ +#ifndef CONFIG_SMP + /* Setup shared vcpu info for non-smp configurations */ + xen_setup_vcpu_info_placement(); +#endif +} + /* Check if running on Xen version (major, minor) or later */ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) @@ -947,34 +968,8 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) xen_write_msr_safe(msr, low, high); } -void xen_setup_shared_info(void) -{ - set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - xen_setup_mfn_list_list(); - - if (system_state == SYSTEM_BOOTING) { -#ifndef CONFIG_SMP - /* - * In UP this is as good a place as any to set up shared info. - * Limit this to boot only, at restore vcpu setup is done via - * xen_vcpu_restore(). - */ - xen_setup_vcpu_info_placement(); -#endif - /* - * Now that shared info is set up we can start using routines - * that point to pvclock area. - */ - xen_init_time_ops(); - } -} - /* This is called once we have the cpu_possible_mask */ -void __ref xen_setup_vcpu_info_placement(void) +void __init xen_setup_vcpu_info_placement(void) { int cpu; @@ -1228,6 +1223,8 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.hyper.init_platform = xen_pv_init_platform; + x86_init.hyper.guest_late_init = xen_pv_guest_late_init; /* * Set up some pagetable state before starting to set any ptes. diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c30cabfda90..52206ad81e4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1230,8 +1230,7 @@ static void __init xen_pagetable_p2m_free(void) * We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or * xen_start_info->shared_info they are in going to crash. Fortunatly - * we have already revectored in xen_setup_kernel_pagetable and in - * xen_setup_shared_info. + * we have already revectored in xen_setup_kernel_pagetable. */ size = roundup(size, PMD_SIZE); @@ -1292,8 +1291,7 @@ static void __init xen_pagetable_init(void) /* Remap memory freed due to conflicts with E820 map */ xen_remap_memory(); - - xen_setup_shared_info(); + xen_setup_mfn_list_list(); } static void xen_write_cr2(unsigned long cr2) { diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c index a2e0f110af56..8303b58c79a9 100644 --- a/arch/x86/xen/suspend_pv.c +++ b/arch/x86/xen/suspend_pv.c @@ -27,8 +27,9 @@ void xen_pv_pre_suspend(void) void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); - - xen_setup_shared_info(); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + xen_setup_mfn_list_list(); if (suspend_cancelled) { xen_start_info->store_mfn = diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index e0f1bcf01d63..c84f1e039d84 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -31,6 +31,8 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 +static u64 xen_sched_clock_offset __read_mostly; + /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) { @@ -40,7 +42,7 @@ static unsigned long xen_tsc_khz(void) return pvclock_tsc_khz(info); } -u64 xen_clocksource_read(void) +static u64 xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; u64 ret; @@ -57,6 +59,11 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } +static u64 xen_sched_clock(void) +{ + return xen_clocksource_read() - xen_sched_clock_offset; +} + static void xen_read_wallclock(struct timespec64 *ts) { struct shared_info *s = HYPERVISOR_shared_info; @@ -367,7 +374,7 @@ void xen_timer_resume(void) } static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_clocksource_read, + .sched_clock = xen_sched_clock, .steal_clock = xen_steal_clock, }; @@ -503,8 +510,9 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __ref xen_init_time_ops(void) +void __init xen_init_time_ops(void) { + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.timer_init = xen_time_init; @@ -542,11 +550,11 @@ void __init xen_hvm_init_time_ops(void) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { - printk(KERN_INFO "Xen doesn't support pvclock on HVM," - "disable pv timer\n"); + pr_info("Xen doesn't support pvclock on HVM, disable pv timer"); return; } + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3b34745d0a52..e78684597f57 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,7 +31,6 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); -void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); @@ -68,12 +67,11 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); -u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __ref xen_init_time_ops(void); -void __init xen_hvm_init_time_ops(void); +void xen_init_time_ops(void); +void xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); diff --git a/drivers/clocksource/tegra20_timer.c b/drivers/clocksource/tegra20_timer.c index dabf0a103567..aa624885e0e2 100644 --- a/drivers/clocksource/tegra20_timer.c +++ b/drivers/clocksource/tegra20_timer.c @@ -259,6 +259,6 @@ static int __init tegra20_init_rtc(struct device_node *np) else clk_prepare_enable(clk); - return register_persistent_clock(NULL, tegra_read_persistent_clock64); + return register_persistent_clock(tegra_read_persistent_clock64); } TIMER_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc); diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 411b52e424e1..abe28d5cb3f4 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -9,17 +9,16 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK -extern void sched_clock_postinit(void); +extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate); #else -static inline void sched_clock_postinit(void) { } +static inline void generic_sched_clock_init(void) { } static inline void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { - ; } #endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index edace6b656e9..e79861418fd7 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -243,7 +243,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock64(struct timespec64 *ts); +void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock, + struct timespec64 *boot_offset); extern int update_persistent_clock64(struct timespec64 now); /* diff --git a/init/main.c b/init/main.c index f86b64c35d38..38c68b593d0d 100644 --- a/init/main.c +++ b/init/main.c @@ -79,7 +79,7 @@ #include <linux/pti.h> #include <linux/blkdev.h> #include <linux/elevator.h> -#include <linux/sched_clock.h> +#include <linux/sched/clock.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/context_tracking.h> @@ -642,7 +642,6 @@ asmlinkage __visible void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_postinit(); printk_safe_init(); perf_event_init(); profile_init(); @@ -697,6 +696,7 @@ asmlinkage __visible void __init start_kernel(void) acpi_early_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pid_idr_init(); anon_vma_init(); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 10c83e73837a..e3e3b979f9bd 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,6 +53,7 @@ * */ #include "sched.h" +#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. @@ -66,12 +67,7 @@ unsigned long long __weak sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* @@ -195,17 +191,40 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ - if (sched_clock_running == 2) + if (static_key_count(&sched_clock_running.key) == 2) __clear_sched_clock_stable(); } +static void __sched_clock_gtod_offset(void) +{ + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; +} + +void __init sched_clock_init(void) +{ + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. + */ + local_irq_disable(); + __sched_clock_gtod_offset(); + local_irq_enable(); + + static_branch_inc(&sched_clock_running); +} /* * We run this as late_initcall() such that it runs after all built-in drivers, * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. */ static int __init sched_clock_init_late(void) { - sched_clock_running = 2; + static_branch_inc(&sched_clock_running); /* * Ensure that it is impossible to not do a static_key update. * @@ -350,8 +369,8 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (unlikely(!sched_clock_running)) - return 0ull; + if (!static_branch_unlikely(&sched_clock_running)) + return sched_clock(); preempt_disable_notrace(); scd = cpu_sdc(cpu); @@ -373,7 +392,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -385,8 +404,6 @@ void sched_clock_tick(void) void sched_clock_tick_stable(void) { - u64 gtod, clock; - if (!sched_clock_stable()) return; @@ -398,9 +415,7 @@ void sched_clock_tick_stable(void) * TSC to be unstable, any computation will be computing crap. */ local_irq_disable(); - gtod = ktime_get_ns(); - clock = sched_clock(); - __gtod_offset = (clock + __sched_clock_offset) - gtod; + __sched_clock_gtod_offset(); local_irq_enable(); } @@ -434,9 +449,17 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +void __init sched_clock_init(void) +{ + static_branch_inc(&sched_clock_running); + local_irq_disable(); + generic_sched_clock_init(); + local_irq_enable(); +} + u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17d99778356..c45de46fdf10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5916,7 +5916,6 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - sched_clock_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 870d4f3da285..60caf1fb94e0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -622,8 +622,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } -extern __read_mostly int sched_clock_running; - static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 2d8f05aad442..cbc72c2c1fca 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -void __init sched_clock_postinit(void) +void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9e659a12c76..f3b22f456fac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> +#include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> @@ -1505,18 +1506,23 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock64 - Return time of the system start. + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. * * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. */ -void __weak read_boot_clock64(struct timespec64 *ts) +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { - ts->tv_sec = 0; - ts->tv_nsec = 0; + read_persistent_clock64(wall_time); + *boot_offset = ns_to_timespec64(local_clock()); } /* @@ -1542,28 +1548,29 @@ static bool persistent_clock_exists; */ void __init timekeeping_init(void) { + struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec64 now, boot, tmp; - - read_persistent_clock64(&now); - if (!timespec64_valid_strict(&now)) { - pr_warn("WARNING: Persistent clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - now.tv_sec = 0; - now.tv_nsec = 0; - } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exists = true; - read_boot_clock64(&boot); - if (!timespec64_valid_strict(&boot)) { - pr_warn("WARNING: Boot clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - boot.tv_sec = 0; - boot.tv_nsec = 0; + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_strict(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else if (timespec64_to_ns(&wall_time) != 0) { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; } + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1573,13 +1580,10 @@ void __init timekeeping_init(void) clock->enable(clock); tk_setup_internals(tk, clock); - tk_set_xtime(tk, &now); + tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(tk); - set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(tk, tmp); + tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |