From ee6825c80e870fff1a370c718ec77022ade0889b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Mar 2016 15:52:34 +0100 Subject: x86/topology: Fix AMD core count It turns out AMD gets x86_max_cores wrong when there are compute units. The issue is that Linux assumes: nr_logical_cpus = nr_cores * nr_siblings But AMD reports its CU unit as 2 cores, but then sets num_smp_siblings to 2 as well. Boris: fixup ras/mce_amd_inj.c too, to compute the Node Base Core properly, according to the new nomenclature. Fixes: 1f12e32f4cd5 ("x86/topology: Create logical package id") Reported-by: Xiong Zhou Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: Andreas Herrmann Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/20160317095220.GO6344@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/cpu/amd.c | 8 ++++---- arch/x86/ras/mce_amd_inj.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 20a3de5cb3b0..66b057306f40 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -155,6 +155,7 @@ static inline int wbinvd_on_all_cpus(void) wbinvd(); return 0; } +#define smp_num_siblings 1 #endif /* CONFIG_SMP */ extern unsigned disabled_cpus; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6e47e3a916f1..4d0087f94ee5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -312,9 +312,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) node_id = ecx & 7; /* get compute unit information */ - smp_num_siblings = ((ebx >> 8) & 3) + 1; + cores_per_cu = smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->x86_max_cores /= smp_num_siblings; c->compute_unit_id = ebx & 0xff; - cores_per_cu += ((ebx >> 8) & 3); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -329,8 +329,8 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes_per_socket; - cus_per_node = cores_per_node / cores_per_cu; + cus_per_node = c->x86_max_cores / nodes_per_socket; + cores_per_node = cus_per_node * cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 55d38cfa46c2..9e02dcaef683 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -206,7 +207,7 @@ static u32 get_nbc_for_node(int node_id) struct cpuinfo_x86 *c = &boot_cpu_data; u32 cores_per_node; - cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket(); + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); return cores_per_node * node_id; } -- cgit v1.2.3 From 32b62f446827f696cc474a6d83cea93693c5ed49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Mar 2016 15:52:35 +0100 Subject: perf/x86/amd: Cleanup Fam10h NB event constraints Avoid allocating the AMD NB event constraints data structure when not needed. This gets rid of x86_max_cores usage and avoids allocating this on AMD Core Perfctr supporting hardware (which has separate MSRs for NB events). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: aherrmann@suse.com Cc: Rui Huang Cc: Borislav Petkov Cc: jencce.kernel@gmail.com Link: http://lkml.kernel.org/r/20160320124629.GY6375@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- arch/x86/events/amd/core.c | 21 ++++++++++++++++++--- arch/x86/events/perf_event.h | 5 +++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 049ada8d4e9c..86a9bec18dab 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -369,7 +369,7 @@ static int amd_pmu_cpu_prepare(int cpu) WARN_ON_ONCE(cpuc->amd_nb); - if (boot_cpu_data.x86_max_cores < 2) + if (!x86_pmu.amd_nb_constraints) return NOTIFY_OK; cpuc->amd_nb = amd_alloc_nb(cpu); @@ -388,7 +388,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2) + if (!x86_pmu.amd_nb_constraints) return; nb_id = amd_get_nb_id(cpu); @@ -414,7 +414,7 @@ static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; - if (boot_cpu_data.x86_max_cores < 2) + if (!x86_pmu.amd_nb_constraints) return; cpuhw = &per_cpu(cpu_hw_events, cpu); @@ -648,6 +648,8 @@ static __initconst const struct x86_pmu amd_pmu = { .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, + + .amd_nb_constraints = 1, }; static int __init amd_core_pmu_init(void) @@ -674,6 +676,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + /* + * AMD Core perfctr has separate MSRs for the NB events, see + * the amd/uncore.c driver. + */ + x86_pmu.amd_nb_constraints = 0; pr_cont("core perfctr, "); return 0; @@ -693,6 +700,14 @@ __init int amd_pmu_init(void) if (ret) return ret; + if (num_possible_cpus() == 1) { + /* + * No point in allocating data structures to serialize + * against other CPUs, when there is only the one CPU. + */ + x86_pmu.amd_nb_constraints = 0; + } + /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba6ef18528c9..716d0482f5db 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -607,6 +607,11 @@ struct x86_pmu { */ atomic_t lbr_exclusive[x86_lbr_exclusive_max]; + /* + * AMD bits + */ + unsigned int amd_nb_constraints : 1; + /* * Extra registers for events */ -- cgit v1.2.3 From 8196dab4fc159943df6baaac04973bb1accb7100 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 25 Mar 2016 15:52:36 +0100 Subject: x86/cpu: Get rid of compute_unit_id It is cpu_core_id anyway. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1458917557-8757-3-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/amd_nb.c | 6 ++---- arch/x86/kernel/cpu/amd.c | 10 +++------- arch/x86/kernel/smpboot.c | 2 +- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 983738ac014c..9264476f3d57 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -132,8 +132,6 @@ struct cpuinfo_x86 { u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; - /* Compute unit id */ - u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 29fa475ec518..a147e676fc7b 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -170,15 +170,13 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); - cuid = cpu_data(cpu).compute_unit_id; - return (mask >> (4 * cuid)) & 0xf; + return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf; } int amd_set_subcaches(int cpu, unsigned long mask) @@ -204,7 +202,7 @@ int amd_set_subcaches(int cpu, unsigned long mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } - cuid = cpu_data(cpu).compute_unit_id; + cuid = cpu_data(cpu).cpu_core_id; mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4d0087f94ee5..7b76eb67a9b3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -300,7 +300,6 @@ static int nearby_node(int apicid) #ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { - u32 cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -312,9 +311,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) node_id = ecx & 7; /* get compute unit information */ - cores_per_cu = smp_num_siblings = ((ebx >> 8) & 3) + 1; + smp_num_siblings = ((ebx >> 8) & 3) + 1; c->x86_max_cores /= smp_num_siblings; - c->compute_unit_id = ebx & 0xff; + c->cpu_core_id = ebx & 0xff; } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -325,19 +324,16 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cores_per_node; u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); cus_per_node = c->x86_max_cores / nodes_per_socket; - cores_per_node = cus_per_node * cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; /* core id has to be in the [0 .. cores_per_node - 1] range */ - c->cpu_core_id %= cores_per_node; - c->compute_unit_id %= cus_per_node; + c->cpu_core_id %= cus_per_node; } } #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b2c99f811c3f..a2065d3b3b39 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -422,7 +422,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (c->phys_proc_id == o->phys_proc_id && per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->compute_unit_id == o->compute_unit_id) + c->cpu_core_id == o->cpu_core_id) return topology_sane(c, o, "smt"); } else if (c->phys_proc_id == o->phys_proc_id && -- cgit v1.2.3 From f7be8610bca88e59dd2fd5d98fcbc5031ef0e079 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 28 Mar 2016 11:56:09 +0200 Subject: x86/Documentation: Start documenting x86 topology This should contain important aspects of how we represent the system topology on x86. If people have questions about it and this file doesn't answer it, then it must be updated. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20160328095609.GD26651@pd.tnic Signed-off-by: Thomas Gleixner --- Documentation/x86/topology.txt | 208 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 Documentation/x86/topology.txt diff --git a/Documentation/x86/topology.txt b/Documentation/x86/topology.txt new file mode 100644 index 000000000000..06afac252f5b --- /dev/null +++ b/Documentation/x86/topology.txt @@ -0,0 +1,208 @@ +x86 Topology +============ + +This documents and clarifies the main aspects of x86 topology modelling and +representation in the kernel. Update/change when doing changes to the +respective code. + +The architecture-agnostic topology definitions are in +Documentation/cputopology.txt. This file holds x86-specific +differences/specialities which must not necessarily apply to the generic +definitions. Thus, the way to read up on Linux topology on x86 is to start +with the generic one and look at this one in parallel for the x86 specifics. + +Needless to say, code should use the generic functions - this file is *only* +here to *document* the inner workings of x86 topology. + +Started by Thomas Gleixner and Borislav Petkov . + +The main aim of the topology facilities is to present adequate interfaces to +code which needs to know/query/use the structure of the running system wrt +threads, cores, packages, etc. + +The kernel does not care about the concept of physical sockets because a +socket has no relevance to software. It's an electromechanical component. In +the past a socket always contained a single package (see below), but with the +advent of Multi Chip Modules (MCM) a socket can hold more than one package. So +there might be still references to sockets in the code, but they are of +historical nature and should be cleaned up. + +The topology of a system is described in the units of: + + - packages + - cores + - threads + +* Package: + + Packages contain a number of cores plus shared resources, e.g. DRAM + controller, shared caches etc. + + AMD nomenclature for package is 'Node'. + + Package-related topology information in the kernel: + + - cpuinfo_x86.x86_max_cores: + + The number of cores in a package. This information is retrieved via CPUID. + + - cpuinfo_x86.phys_proc_id: + + The physical ID of the package. This information is retrieved via CPUID + and deduced from the APIC IDs of the cores in the package. + + - cpuinfo_x86.logical_id: + + The logical ID of the package. As we do not trust BIOSes to enumerate the + packages in a consistent way, we introduced the concept of logical package + ID so we can sanely calculate the number of maximum possible packages in + the system and have the packages enumerated linearly. + + - topology_max_packages(): + + The maximum possible number of packages in the system. Helpful for per + package facilities to preallocate per package information. + + +* Cores: + + A core consists of 1 or more threads. It does not matter whether the threads + are SMT- or CMT-type threads. + + AMDs nomenclature for a CMT core is "Compute Unit". The kernel always uses + "core". + + Core-related topology information in the kernel: + + - smp_num_siblings: + + The number of threads in a core. The number of threads in a package can be + calculated by: + + threads_per_package = cpuinfo_x86.x86_max_cores * smp_num_siblings + + +* Threads: + + A thread is a single scheduling unit. It's the equivalent to a logical Linux + CPU. + + AMDs nomenclature for CMT threads is "Compute Unit Core". The kernel always + uses "thread". + + Thread-related topology information in the kernel: + + - topology_core_cpumask(): + + The cpumask contains all online threads in the package to which a thread + belongs. + + The number of online threads is also printed in /proc/cpuinfo "siblings." + + - topology_sibling_mask(): + + The cpumask contains all online threads in the core to which a thread + belongs. + + - topology_logical_package_id(): + + The logical package ID to which a thread belongs. + + - topology_physical_package_id(): + + The physical package ID to which a thread belongs. + + - topology_core_id(); + + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + + + +System topology examples + +Note: + +The alternative Linux CPU enumeration depends on how the BIOS enumerates the +threads. Many BIOSes enumerate all threads 0 first and then all threads 1. +That has the "advantage" that the logical Linux CPU numbers of threads 0 stay +the same whether threads are enabled or not. That's merely an implementation +detail and has no practical impact. + +1) Single Package, Single Core + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + +2) Single Package, Dual Core + + a) One thread per core + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [core 1] -> [thread 0] -> Linux CPU 1 + + b) Two threads per core + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [thread 1] -> Linux CPU 1 + -> [core 1] -> [thread 0] -> Linux CPU 2 + -> [thread 1] -> Linux CPU 3 + + Alternative enumeration: + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [thread 1] -> Linux CPU 2 + -> [core 1] -> [thread 0] -> Linux CPU 1 + -> [thread 1] -> Linux CPU 3 + + AMD nomenclature for CMT systems: + + [node 0] -> [Compute Unit 0] -> [Compute Unit Core 0] -> Linux CPU 0 + -> [Compute Unit Core 1] -> Linux CPU 1 + -> [Compute Unit 1] -> [Compute Unit Core 0] -> Linux CPU 2 + -> [Compute Unit Core 1] -> Linux CPU 3 + +4) Dual Package, Dual Core + + a) One thread per core + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [core 1] -> [thread 0] -> Linux CPU 1 + + [package 1] -> [core 0] -> [thread 0] -> Linux CPU 2 + -> [core 1] -> [thread 0] -> Linux CPU 3 + + b) Two threads per core + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [thread 1] -> Linux CPU 1 + -> [core 1] -> [thread 0] -> Linux CPU 2 + -> [thread 1] -> Linux CPU 3 + + [package 1] -> [core 0] -> [thread 0] -> Linux CPU 4 + -> [thread 1] -> Linux CPU 5 + -> [core 1] -> [thread 0] -> Linux CPU 6 + -> [thread 1] -> Linux CPU 7 + + Alternative enumeration: + + [package 0] -> [core 0] -> [thread 0] -> Linux CPU 0 + -> [thread 1] -> Linux CPU 4 + -> [core 1] -> [thread 0] -> Linux CPU 1 + -> [thread 1] -> Linux CPU 5 + + [package 1] -> [core 0] -> [thread 0] -> Linux CPU 2 + -> [thread 1] -> Linux CPU 6 + -> [core 1] -> [thread 0] -> Linux CPU 3 + -> [thread 1] -> Linux CPU 7 + + AMD nomenclature for CMT systems: + + [node 0] -> [Compute Unit 0] -> [Compute Unit Core 0] -> Linux CPU 0 + -> [Compute Unit Core 1] -> Linux CPU 1 + -> [Compute Unit 1] -> [Compute Unit Core 0] -> Linux CPU 2 + -> [Compute Unit Core 1] -> Linux CPU 3 + + [node 1] -> [Compute Unit 0] -> [Compute Unit Core 0] -> Linux CPU 4 + -> [Compute Unit Core 1] -> Linux CPU 5 + -> [Compute Unit 1] -> [Compute Unit Core 0] -> Linux CPU 6 + -> [Compute Unit Core 1] -> Linux CPU 7 -- cgit v1.2.3 From 4a6772f514891eaacf26bcb7c2c808c557d23c6f Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Sat, 26 Mar 2016 20:47:00 +0200 Subject: x86/cpufreq: Remove duplicated TDP MSR macro definitions The list of CPU model specific registers contains two copies of TDP registers, remove the one, which is out of numerical order in the list. Fixes: 6a35fc2d6c22 ("cpufreq: intel_pstate: get P1 from TAR when available") Signed-off-by: Vladimir Zapolskiy Cc: Len Brown Cc: "Rafael J. Wysocki" Cc: Kristen Carlson Accardi Cc: Srinivas Pandruvada Link: http://lkml.kernel.org/r/1459018020-24577-1-git-send-email-vladimir_zapolskiy@mentor.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msr-index.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2da46ac16e37..426e946ed0c0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -190,6 +190,7 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +/* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 #define MSR_CONFIG_TDP_LEVEL_2 0x0000064A @@ -210,13 +211,6 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 -/* Config TDP MSRs */ -#define MSR_CONFIG_TDP_NOMINAL 0x00000648 -#define MSR_CONFIG_TDP_LEVEL1 0x00000649 -#define MSR_CONFIG_TDP_LEVEL2 0x0000064A -#define MSR_CONFIG_TDP_CONTROL 0x0000064B -#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C - /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f -- cgit v1.2.3 From 5f870a3f7188065e13efafe1faeb01c136173bc4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 28 Mar 2016 20:20:17 +0200 Subject: x86/thread_info: Merge two !__ASSEMBLY__ sections We have #ifndef __ASSEMBLY__ ... #endif #ifndef __ASSEMBLY__ ... #endif Merge the two. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1459189217-25532-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/thread_info.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 82866697fcf1..ffae84df8a93 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -276,11 +276,9 @@ static inline bool is_ia32_task(void) */ #define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void arch_release_task_struct(struct task_struct *tsk); -#endif +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_THREAD_INFO_H */ -- cgit v1.2.3 From 34a4cceb78e48c75d1b48b25352a3f3b2cc2b2da Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 25 Mar 2016 10:08:40 +0800 Subject: x86/cpu: Add advanced power management bits Bit 11 of CPUID 8000_0007 edx is processor feedback interface. Bit 12 of CPUID 8000_0007 edx is accumulated power. Print proper names in proc/cpuinfo Reported-and-tested-by: Borislav Petkov Signed-off-by: Huang Rui Cc: Tony Li Cc: Fenghua Yu Cc: Tony Luck Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Andy Lutomirski Cc: Fengguang Wu Cc: Sherry Hurwitz Cc: Borislav Petkov Cc: "Len Brown" Link: http://lkml.kernel.org/r/1458871720-3209-1-git-send-email-ray.huang@amd.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/powerflags.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 31f0f335ed22..1dd8294fd730 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -18,4 +18,6 @@ const char *const x86_power_flags[32] = { "", /* tsc invariant mapped to constant_tsc */ "cpb", /* core performance boost */ "eff_freq_ro", /* Readonly aperf/mperf */ + "proc_feedback", /* processor feedback interface */ + "acc_power", /* accumulated power mechanism */ }; -- cgit v1.2.3 From 591b1d8d86074ac3a3163d89bcfe7b232cf83902 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Dec 2015 11:06:34 -0800 Subject: x86/mm/pkeys: Add missing Documentation Stefan Richter noticed that the X86_INTEL_MEMORY_PROTECTION_KEYS option in arch/x86/Kconfig references Documentation/x86/protection-keys.txt, but the file does not exist. This is a patch merging mishap: the final (v8) version of the pkeys series did not include the documentation patch 32 and v7 included. Add it now. Reported-by: Stefan Richter Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20151214190634.426BEE41@viggo.jf.intel.com [ Added changelog. ] Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- Documentation/x86/protection-keys.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 Documentation/x86/protection-keys.txt diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt new file mode 100644 index 000000000000..c281ded1ba16 --- /dev/null +++ b/Documentation/x86/protection-keys.txt @@ -0,0 +1,27 @@ +Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature +which will be found on future Intel CPUs. + +Memory Protection Keys provides a mechanism for enforcing page-based +protections, but without requiring modification of the page tables +when an application changes protection domains. It works by +dedicating 4 previously ignored bits in each page table entry to a +"protection key", giving 16 possible keys. + +There is also a new user-accessible register (PKRU) with two separate +bits (Access Disable and Write Disable) for each key. Being a CPU +register, PKRU is inherently thread-local, potentially giving each +thread a different set of protections from every other thread. + +There are two new instructions (RDPKRU/WRPKRU) for reading and writing +to the new register. The feature is only available in 64-bit mode, +even though there is theoretically space in the PAE PTEs. These +permissions are enforced on data access only and have no effect on +instruction fetches. + +=========================== Config Option =========================== + +This config option adds approximately 1.5kb of text. and 50 bytes of +data to the executable. A workload which does large O_DIRECT reads +of holes in XFS files was run to exercise get_user_pages_fast(). No +performance delta was observed with the config option +enabled or disabled. -- cgit v1.2.3 From 6d92bc9d483aa1751755a66fee8fb39dffb088c0 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 16 Mar 2016 20:04:35 -0700 Subject: x86/build: Build compressed x86 kernels as PIE The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X relocation to get the symbol address in PIC. When the compressed x86 kernel isn't built as PIC, the linker optimizes R_386_GOT32X relocations to their fixed symbol addresses. However, when the compressed x86 kernel is loaded at a different address, it leads to the following load failure: Failed to allocate space for phdrs during the decompression stage. If the compressed x86 kernel is relocatable at run-time, it should be compiled with -fPIE, instead of -fPIC, if possible and should be built as Position Independent Executable (PIE) so that linker won't optimize R_386_GOT32X relocation to its fixed symbol address. Older linkers generate R_386_32 relocations against locally defined symbols, _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle R_386_32 relocations when relocating the kernel. To generate R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as hidden in both 32-bit and 64-bit x86 kernels. To build a 64-bit compressed x86 kernel as PIE, we need to disable the relocation overflow check to avoid relocation overflow errors. We do this with a new linker command-line option, -z noreloc-overflow, which got added recently: commit 4c10bbaa0912742322f10d9d5bb630ba4e15dfa7 Author: H.J. Lu Date: Tue Mar 15 11:07:06 2016 -0700 Add -z noreloc-overflow option to x86-64 ld Add -z noreloc-overflow command-line option to the x86-64 ELF linker to disable relocation overflow check. This can be used to avoid relocation overflow check if there will be no dynamic relocation overflow at run-time. The 64-bit compressed x86 kernel is built as PIE only if the linker supports -z noreloc-overflow. So far 64-bit relocatable compressed x86 kernel boots fine even when it is built as a normal executable. Signed-off-by: H.J. Lu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org [ Edited the changelog and comments. ] Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/Makefile | 14 +++++++++++++- arch/x86/boot/compressed/head_32.S | 28 ++++++++++++++++++++++++++++ arch/x86/boot/compressed/head_64.S | 8 ++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6915ff2bd996..8774cb23064f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -26,7 +26,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 -KBUILD_CFLAGS += -fno-strict-aliasing -fPIC +KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small @@ -40,6 +40,18 @@ GCOV_PROFILE := n UBSAN_SANITIZE :=n LDFLAGS := -m elf_$(UTS_MACHINE) +ifeq ($(CONFIG_RELOCATABLE),y) +# If kernel is relocatable, build compressed kernel as PIE. +ifeq ($(CONFIG_X86_32),y) +LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) +else +# To build 64-bit compressed kernel as PIE, we disable relocation +# overflow check to avoid relocation overflow error with a new linker +# command-line option, -z noreloc-overflow. +LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ + && echo "-z noreloc-overflow -pie --no-dynamic-linker") +endif +endif LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 8ef964ddc18e..0256064da8da 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -31,6 +31,34 @@ #include #include +/* + * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X + * relocation to get the symbol address in PIC. When the compressed x86 + * kernel isn't built as PIC, the linker optimizes R_386_GOT32X + * relocations to their fixed symbol addresses. However, when the + * compressed x86 kernel is loaded at a different address, it leads + * to the following load failure: + * + * Failed to allocate space for phdrs + * + * during the decompression stage. + * + * If the compressed x86 kernel is relocatable at run-time, it should be + * compiled with -fPIE, instead of -fPIC, if possible and should be built as + * Position Independent Executable (PIE) so that linker won't optimize + * R_386_GOT32X relocation to its fixed symbol address. Older + * linkers generate R_386_32 relocations against locally defined symbols, + * _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less + * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle + * R_386_32 relocations when relocating the kernel. To generate + * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as + * hidden: + */ + .hidden _bss + .hidden _ebss + .hidden _got + .hidden _egot + __HEAD ENTRY(startup_32) #ifdef CONFIG_EFI_STUB diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index b0c0d16ef58d..86558a199139 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,14 @@ #include #include +/* + * Locally defined symbols should be marked hidden: + */ + .hidden _bss + .hidden _ebss + .hidden _got + .hidden _egot + __HEAD .code32 ENTRY(startup_32) -- cgit v1.2.3 From f87e0434a3bedeb5e4d75d96d9f3ad424dae6b33 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 1 Apr 2016 12:15:46 +1030 Subject: lguest, x86/entry/32: Fix handling of guest syscalls using interrupt gates In a798f091113e ("x86/entry/32: Change INT80 to be an interrupt gate") Andy broke lguest. This is because lguest had special code to allow the 0x80 trap gate go straight into the guest itself; interrupts gates (without more work, as mentioned in the file's comments) bounce via the hypervisor. His change made them go via the hypervisor, but as it's in the range of normal hardware interrupts, they were not directed through to the guest at all. Turns out the guest userspace isn't very effective if syscalls are all noops. I haven't ripped out all the now-useless trap-direct-to-guest-kernel code yet, since it will still be needed if someone decides to update this optimization. Signed-off-by: Rusty Russell Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Weisbecker Cc: x86\@kernel.org Link: http://lkml.kernel.org/r/87fuv685kl.fsf@rustcorp.com.au Signed-off-by: Ingo Molnar --- drivers/lguest/interrupts_and_traps.c | 6 +++++- drivers/lguest/lg.h | 1 + drivers/lguest/x86/core.c | 6 +++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index eb934b0242e0..67392b6ab845 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -331,7 +331,7 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) * Actually now I think of it, it's possible that Ron *is* half the Plan 9 * userbase. Oh well. */ -static bool could_be_syscall(unsigned int num) +bool could_be_syscall(unsigned int num) { /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ return num == IA32_SYSCALL_VECTOR || num == syscall_vector; @@ -416,6 +416,10 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * * This routine indicates if a particular trap number could be delivered * directly. + * + * Unfortunately, Linux 4.6 started using an interrupt gate instead of a + * trap gate for syscalls, so this trick is ineffective. See Mastery for + * how we could do this anyway... */ static bool direct_trap(unsigned int num) { diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index ac8ad0461e80..69b3814afd2f 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -167,6 +167,7 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); +bool could_be_syscall(unsigned int num); int init_interrupts(void); void free_interrupts(void); diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 6a4cd771a2be..adc162c7040d 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -429,8 +429,12 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) return; break; case 32 ... 255: + /* This might be a syscall. */ + if (could_be_syscall(cpu->regs->trapnum)) + break; + /* - * These values mean a real interrupt occurred, in which case + * Other values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then * return to run the Guest again. -- cgit v1.2.3 From a3125494cff084b098c80bb36fbe2061ffed9d52 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 6 Apr 2016 10:05:16 +0200 Subject: x86/mce: Avoid using object after free in genpool When we loop over all queued machine check error records to pass them to the registered notifiers we use llist_for_each_entry(). But the loop calls gen_pool_free() for the entry in the body of the loop - and then the iterator looks at node->next after the free. Use llist_for_each_entry_safe() instead. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Cc: Gong Chen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/0205920@agluck-desk.sc.intel.com Link: http://lkml.kernel.org/r/1459929916-12852-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-genpool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 0a850100c594..2658e2af74ec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -29,7 +29,7 @@ static char gen_pool_buf[MCE_POOLSZ]; void mce_gen_pool_process(void) { struct llist_node *head; - struct mce_evt_llist *node; + struct mce_evt_llist *node, *tmp; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -37,7 +37,7 @@ void mce_gen_pool_process(void) return; head = llist_reverse_order(head); - llist_for_each_entry(node, head, llnode) { + llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); -- cgit v1.2.3