From 8e57c586c6a8333c266a6cf76d50e110f2954558 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 3 Feb 2015 12:40:54 +0100 Subject: perf/x86/intel/uncore: Delete an unnecessary check before pci_dev_put() call The pci_dev_put() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Linus Torvalds Link: http://lkml.kernel.org/r/54D0B59C.2060106@users.sourceforge.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 21af6149edf2..12d9548457e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid) } } - if (ubox_dev) - pci_dev_put(ubox_dev); + pci_dev_put(ubox_dev); return err ? pcibios_err_to_errno(err) : 0; } -- cgit v1.2.3 From c796b205b88c775fd220c1a63390bac6a8cdda3f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Fri, 23 Jan 2015 12:19:35 -0600 Subject: perf/x86/amd/ibs: Convert force_ibs_eilvt_setup() to void The caller of force_ibs_eilvt_setup() is ibs_eilvt_setup() which does not care about the return values. So mark it void and clean up the return statements. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422037175-20957-1-git-send-email-aravind.gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index a61f5c6911da..989d3c215d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off) * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that * is using the new offset. */ -static int force_ibs_eilvt_setup(void) +static void force_ibs_eilvt_setup(void) { int offset; int ret; @@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void) if (offset == APIC_EILVT_NR_MAX) { printk(KERN_DEBUG "No EILVT entry available\n"); - return -EBUSY; + return; } ret = setup_ibs_ctl(offset); if (ret) goto out; - if (!ibs_eilvt_valid()) { - ret = -EFAULT; + if (!ibs_eilvt_valid()) goto out; - } pr_info("IBS: LVT offset %d assigned\n", offset); - return 0; + return; out: preempt_disable(); put_eilvt(offset); preempt_enable(); - return ret; + return; } static void ibs_eilvt_setup(void) -- cgit v1.2.3 From 27ac905b8f88d28779b0661809286b5ba2817d37 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:57 -0500 Subject: perf/x86/intel: Reduce lbr_sel_map[] size The index of lbr_sel_map is bit value of perf branch_sample_type. PERF_SAMPLE_BRANCH_MAX is 1024 at present, so each lbr_sel_map uses 4096 bytes. By using bit shift as index, we can reduce lbr_sel_map size to 40 bytes. This patch defines 'bit shift' for branch types, and use 'bit shift' to define lbr_sel_maps. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: jolsa@redhat.com Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1415156173-10035-2-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 4 +++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 54 ++++++++++++++---------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index df525d2be1e8..0c45b22495dc 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -515,6 +515,10 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +enum { + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 58f1a94beaf0..8bc078f43a82 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -69,10 +69,6 @@ static enum { #define LBR_FROM_FLAG_IN_TX (1ULL << 62) #define LBR_FROM_FLAG_ABORT (1ULL << 61) -#define for_each_branch_sample_type(x) \ - for ((x) = PERF_SAMPLE_BRANCH_USER; \ - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) - /* * x86control flow change classification * x86control flow changes include branches, interrupts, traps, faults @@ -403,14 +399,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) { struct hw_perf_event_extra *reg; u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, m; - u64 v; + u64 mask = 0, v; + int i; - for_each_branch_sample_type(m) { - if (!(br_type & m)) + for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) { + if (!(br_type & (1ULL << i))) continue; - v = x86_pmu.lbr_sel_map[m]; + v = x86_pmu.lbr_sel_map[i]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; @@ -678,35 +674,35 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches */ - [PERF_SAMPLE_BRANCH_ANY_CALL] = + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include IND_JMP to capture IND_CALL */ - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; /* core */ -- cgit v1.2.3 From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:58 -0500 Subject: perf: Introduce pmu context switch callback The callback is invoked when process is scheduled in or out. It provides mechanism for later patches to save/store the LBR stack. For the schedule in case, the callback is invoked at the same place that flush branch stack callback is invoked. So it also can replace the flush branch stack callback. To avoid unnecessary overhead, the callback is enabled only when there are events use the LBR stack. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ arch/x86/kernel/cpu/perf_event.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..0efbd6cc2966 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + static void x86_pmu_flush_branch_stack(void) { if (x86_pmu.flush_branch_stack) @@ -1950,6 +1956,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 0c45b22495dc..211b54c0c00c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -473,6 +473,8 @@ struct x86_pmu { void (*check_microcode)(void); void (*flush_branch_stack)(void); + void (*sched_task)(struct perf_event_context *ctx, + bool sched_in); /* * Intel Arch Perfmon v2+ -- cgit v1.2.3 From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:59 -0500 Subject: perf/x86/intel: Use context switch callback to flush LBR stack Previous commit introduces context switch callback, its function overlaps with the flush branch stack callback. So we can use the context switch callback to flush LBR stack. This patch adds code that uses the flush branch callback to flush the LBR stack when task is being scheduled in. The callback is enabled only when there are events use the LBR hardware. This patch also removes all old flush branch stack code. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 ------- arch/x86/kernel/cpu/perf_event.h | 3 ++- arch/x86/kernel/cpu/perf_event_intel.c | 14 +------------- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 27 +++++++++++++++++++++++++++ 4 files changed, 30 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0efbd6cc2966..6b1fd26a37cf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } -static void x86_pmu_flush_branch_stack(void) -{ - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); -} - void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -1955,7 +1949,6 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, .sched_task = x86_pmu_sched_task, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 211b54c0c00c..949d0083a29e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -472,7 +472,6 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*flush_branch_stack)(void); void (*sched_task)(struct perf_event_context *ctx, bool sched_in); @@ -733,6 +732,8 @@ void intel_pmu_pebs_disable_all(void); void intel_ds_init(void); +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 498b6d967138..424fbf74dee7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2044,18 +2044,6 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } -static void intel_pmu_flush_branch_stack(void) -{ - /* - * Intel LBR does not tag entries with the - * PID of the current task, then we need to - * flush it on ctxsw - * For now, we simply reset it - */ - if (x86_pmu.lbr_nr) - intel_pmu_lbr_reset(); -} - PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2107,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .flush_branch_stack = intel_pmu_flush_branch_stack, + .sched_task = intel_pmu_lbr_sched_task, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 8bc078f43a82..c0e23c5f85bb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -177,6 +177,31 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -195,6 +220,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); } void intel_pmu_lbr_disable(struct perf_event *event) @@ -206,6 +232,7 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); -- cgit v1.2.3 From e9d7f7cd97c45e2c612d3b38be05b4cfb27939ee Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:00 -0500 Subject: perf/x86/intel: Add basic Haswell LBR call stack support Haswell has a new feature that utilizes the existing LBR facility to record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP, NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1, bits (NEAR_REL_CALL, NEAR-IND_CALL, NEAR_RET) must be cleared. Due to a hardware bug of Haswell, this feature doesn't work well with FREEZE_LBRS_ON_PMI. When the call stack feature is enabled, the LBR stack will capture unfiltered call data normally, but as return instructions are executed, the last captured branch record is flushed from the on-chip registers in a last-in first-out (LIFO) manner. Thus, branch information relative to leaf functions will not be captured, while preserving the call stack information of the main line execution path. This patch defines a separate lbr_sel map for Haswell. The map contains a new entry for the call stack feature. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 14 ++++- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 91 ++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 949d0083a29e..c9a62c5bca75 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -517,7 +517,11 @@ struct x86_pmu { }; enum { - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE, + + PERF_SAMPLE_BRANCH_CALL_STACK = + 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, }; #define x86_add_quirk(func_) \ @@ -551,6 +555,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \ extern struct x86_pmu x86_pmu __read_mostly; +static inline bool x86_pmu_has_lbr_callstack(void) +{ + return x86_pmu.lbr_sel_map && + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; +} + DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); int x86_perf_event_set_period(struct perf_event *event); @@ -754,6 +764,8 @@ void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_snb(void); +void intel_pmu_lbr_init_hsw(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); int p4_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 424fbf74dee7..a485ba124476 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2537,7 +2537,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_snb(); + intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index c0e23c5f85bb..ac8279e560a1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -39,6 +39,7 @@ static enum { #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ #define LBR_FAR_BIT 8 /* do not capture far branches */ +#define LBR_CALL_STACK_BIT 9 /* enable call stack */ #define LBR_KERNEL (1 << LBR_KERNEL_BIT) #define LBR_USER (1 << LBR_USER_BIT) @@ -49,6 +50,7 @@ static enum { #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) #define LBR_FAR (1 << LBR_FAR_BIT) +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) #define LBR_PLM (LBR_KERNEL | LBR_USER) @@ -74,24 +76,25 @@ static enum { * x86control flow changes include branches, interrupts, traps, faults */ enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_CALL_STACK = 1 << 15,/* call stack */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -373,7 +376,7 @@ void intel_pmu_lbr_read(void) * - in case there is no HW filter * - in case the HW filter has errata or limitations */ -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) { u64 br_type = event->attr.branch_sample_type; int mask = 0; @@ -410,11 +413,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_COND) mask |= X86_BR_JCC; + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { + if (!x86_pmu_has_lbr_callstack()) + return -EOPNOTSUPP; + if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) + return -EINVAL; + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | + X86_BR_CALL_STACK; + } + /* * stash actual user request into reg, it may * be used by fixup code for some CPU */ event->hw.branch_reg.reg = mask; + return 0; } /* @@ -443,8 +456,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; - /* LBR_SELECT operates in suppress mode so invert mask */ - reg->config = ~mask & x86_pmu.lbr_sel_mask; + /* + * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate + * in suppress mode. So LBR_SELECT should be set to + * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + */ + reg->config = mask ^ x86_pmu.lbr_sel_mask; return 0; } @@ -462,7 +479,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * setup SW LBR filter */ - intel_pmu_setup_sw_lbr_filter(event); + ret = intel_pmu_setup_sw_lbr_filter(event); + if (ret) + return ret; /* * setup HW LBR filter, if any @@ -732,6 +751,20 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_RETURN | LBR_CALL_STACK, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -788,6 +821,20 @@ void __init intel_pmu_lbr_init_snb(void) pr_cont("16-deep LBR, "); } +/* haswell */ +void intel_pmu_lbr_init_hsw(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { -- cgit v1.2.3 From e18bf526422769611e7248135e36a4cea0e4e38d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:03 -0500 Subject: perf/x86/intel: Allocate space for storing LBR stack When the LBR call stack is enabled, it is necessary to save/restore the LBR stack on context switch. We can use pmu specific data to store LBR stack when task is scheduled out. This patch adds code that allocates the pmu specific data. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-8-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ arch/x86/kernel/cpu/perf_event.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6b1fd26a37cf..8ffd71ec2173 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -432,6 +432,9 @@ int x86_pmu_hw_config(struct perf_event *event) } } + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -1950,6 +1953,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index c9a62c5bca75..69c26b396cf4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -516,6 +516,13 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +struct x86_perf_task_context { + u64 lbr_from[MAX_LBR_ENTRIES]; + u64 lbr_to[MAX_LBR_ENTRIES]; + int lbr_callstack_users; + int lbr_stack_state; +}; + enum { PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT, PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE, -- cgit v1.2.3 From 63f0c1d84196b712fe9de99a8514486ab416d517 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:04 -0500 Subject: perf/x86/intel: Track number of events that use the LBR callstack When enabling/disabling an event, check if the event uses the LBR callstack feature, adjust the LBR callstack usage count accordingly. Later patch will use the usage count to decide if LBR stack should be saved/restored. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-9-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index ac8279e560a1..ac8e54200934 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -205,9 +205,15 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) } } +static inline bool branch_user_callstack(unsigned br_sel) +{ + return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -222,6 +228,12 @@ void intel_pmu_lbr_enable(struct perf_event *event) } cpuc->br_sel = event->hw.branch_reg.reg; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users++; + } + cpuc->lbr_users++; perf_sched_cb_inc(event->ctx->pmu); } @@ -229,10 +241,17 @@ void intel_pmu_lbr_enable(struct perf_event *event) void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users--; + } + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); -- cgit v1.2.3 From 76cb2c617f12a4dd53c0e899972813b805ad6cc2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:05 -0500 Subject: perf/x86/intel: Save/restore LBR stack during context switch When the LBR call stack is enabled, it is necessary to save/restore the LBR stack on context switch. The solution is saving/restoring the LBR stack to/from task's perf event context. The LBR stack is saved/restored only when there are events that use the LBR call stack. If no event uses LBR call stack, the LBR stack is reset when task is scheduled in. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-10-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 88 ++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index ac8e54200934..a8b3f236cf37 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -180,13 +180,89 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + return tos; +} + +enum { + LBR_NONE, + LBR_VALID, +}; + +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_NONE; +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_VALID; +} + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + /* + * If LBR callstack feature is enabled and the stack was saved when + * the task was scheduled out, restore the stack. Otherwise flush + * the LBR stack. + */ + task_ctx = ctx ? ctx->task_ctx_data : NULL; + if (task_ctx) { + if (sched_in) { + __intel_pmu_lbr_restore(task_ctx); + cpuc->lbr_context = ctx; + } else { + __intel_pmu_lbr_save(task_ctx); + } + return; + } + /* * When sampling the branck stack in system-wide, it may be * necessary to flush the stack on context switch. This happens @@ -279,18 +355,6 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - - return tos; -} - static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; -- cgit v1.2.3 From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:06 -0500 Subject: perf: Simplify the branch stack check Use event->attr.branch_sample_type to replace intel_pmu_needs_lbr_smpl() for avoiding duplicated code that implicitly enables the LBR. Currently, branch stack can be enabled by user explicitly requesting branch sampling or implicit branch sampling to correct PEBS skid. For user explicitly requested branch sampling, the branch_sample_type is explicitly set by user. For PEBS case, the branch_sample_type is also implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a485ba124476..9f1dd18fa395 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * must disable before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_disable(event); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * must enabled before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_enable(event); if (event->attr.exclude_host) @@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - if (intel_pmu_needs_lbr_smpl(event)) { + if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; -- cgit v1.2.3 From 4b854900995194601d767fcd112307b21ed278b2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:08 -0500 Subject: perf/x86/intel: Re-organize code that implicitly enables LBR/PEBS Make later patch more readable, no logic change. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-13-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8ffd71ec2173..e0dab5ce61e9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -399,36 +399,35 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } -- cgit v1.2.3 From 2c70d0086e4e9e2440f0f78098090f32bde14277 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:10 -0500 Subject: perf/x86/intel: Disable FREEZE_LBRS_ON_PMI when LBR operates in callstack mode LBR callstack is designed for PEBS, It does not work well with FREEZE_LBRS_ON_PMI for non PEBS event. If FREEZE_LBRS_ON_PMI is set for non PEBS event, PMIs near call/return instructions may cause superfluous increase/decrease of LBR_TOS. This patch modifies __intel_pmu_lbr_enable() to not enable FREEZE_LBRS_ON_PMI when LBR operates in callstack mode. We currently don't use LBR callstack to capture kernel space callchain, so disabling FREEZE_LBRS_ON_PMI should not be a problem. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-15-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index a8b3f236cf37..92a44fdbc9d3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -131,14 +131,23 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); static void __intel_pmu_lbr_enable(void) { - u64 debugctl; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 debugctl, lbr_select = 0; - if (cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); + if (cpuc->lbr_sel) { + lbr_select = cpuc->lbr_sel->config; + wrmsrl(MSR_LBR_SELECT, lbr_select); + } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + debugctl |= DEBUGCTLMSR_LBR; + /* + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions + * may cause superfluous increase/decrease of LBR_TOS. + */ + if (!(lbr_select & LBR_CALL_STACK)) + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -- cgit v1.2.3 From aa54ae9b87b83af7edabcc34a299e7e014609af4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:11 -0500 Subject: perf/x86/intel: Discard zero length call entries in LBR call stack "Zero length call" uses the attribute of the call instruction to push the immediate instruction pointer on to the stack and then pops off that address into a register. This is accomplished without any matching return instruction. It confuses the hardware and make the recorded call stack incorrect. We can partially resolve this issue by: decode call instructions and discard any zero length call entry in the LBR stack. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-16-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 92a44fdbc9d3..084f2eb20c8b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -94,7 +94,8 @@ enum { X86_BR_ABORT = 1 << 12,/* transaction abort */ X86_BR_IN_TX = 1 << 13,/* in transaction */ X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_CALL_STACK = 1 << 15,/* call stack */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -111,13 +112,15 @@ enum { X86_BR_JMP |\ X86_BR_IRQ |\ X86_BR_ABORT |\ - X86_BR_IND_CALL) + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) #define X86_BR_ANY_CALL \ (X86_BR_CALL |\ X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ X86_BR_SYSCALL |\ X86_BR_IRQ |\ X86_BR_INT) @@ -702,6 +705,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_INT; break; case 0xe8: /* call near rel */ + insn_get_immediate(&insn); + if (insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } case 0x9a: /* call far absolute */ ret = X86_BR_CALL; break; -- cgit v1.2.3 From 2c44b1936bb3b135a3fac8b3493394d42e51cf70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Nov 2014 10:36:45 +0100 Subject: perf/x86/intel: Expose LBR callstack to user space tooling With LBR call stack feature enable, there are three callchain options. Enable the 3rd callchain option (LBR callstack) to user space tooling. Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20141105093759.GQ10501@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 8 -------- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 8 ++++---- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 69c26b396cf4..a371d27d6795 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -523,14 +523,6 @@ struct x86_perf_task_context { int lbr_stack_state; }; -enum { - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT, - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE, - - PERF_SAMPLE_BRANCH_CALL_STACK = - 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, -}; - #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 084f2eb20c8b..0473874109cb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -537,7 +537,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) u64 mask = 0, v; int i; - for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) { + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { if (!(br_type & (1ULL << i))) continue; @@ -821,7 +821,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, @@ -840,7 +840,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, @@ -852,7 +852,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, -- cgit v1.2.3 From 4421f8f0fa02bc982b410cd773223cc280791c54 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 18 Feb 2015 15:21:07 +0100 Subject: livepatch: remove extern specifier from header files Storage-class specifier 'extern' is redundant in front of the function declaration. According to the C specification it has the same meaning as if not present at all. So remove it. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index a455a53d789a..2d29197bd2fb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void) #endif return 0; } -extern int klp_write_module_reloc(struct module *mod, unsigned long type, - unsigned long loc, unsigned long value); +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { -- cgit v1.2.3 From cbc82b17263877ea5d21e84c58ce03f0292458a1 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Fri, 23 Jan 2015 18:45:43 +0000 Subject: x86: Add support for Intel Cache QoS Monitoring (CQM) detection This patch adds support for the new Cache QoS Monitoring (CQM) feature found in future Intel Xeon processors. It includes the new values to track CQM resources to the cpuinfo_x86 structure, plus the CPUID detection routines for CQM. CQM allows a process, or set of processes, to be tracked by the CPU to determine the cache usage of that task group. Using this data from the CPU, software can be written to extract this data and report cache usage and occupancy for a particular process, or group of processes. More information about Cache QoS Monitoring can be found in the Intel (R) x86 Architecture Software Developer Manual, section 17.14. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Chris Webb Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Jacob Shin Cc: Jan Beulich Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Steven Honeyman Cc: Steven Rostedt Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-5-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 9 ++++++++- arch/x86/include/asm/processor.h | 3 +++ arch/x86/kernel/cpu/common.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..361922dcc9b1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 11 /* N 32-bit words worth of info */ +#define NCAPINTS 13 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -226,6 +226,7 @@ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ @@ -242,6 +243,12 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..a12d50e04d7a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -109,6 +109,9 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ int x86_cache_size; int x86_cache_alignment; /* In bytes */ + /* Cache QoS architectural values: */ + int x86_cache_max_rmid; /* max index */ + int x86_cache_occ_scale; /* scale to bytes */ int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07f2fc3c13a4..9fa00b2ea0ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -645,6 +645,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[10] = eax; } + /* Additional Intel-defined flags: level 0x0000000F */ + if (c->cpuid_level >= 0x0000000F) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=0 */ + cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[11] = edx; + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = ebx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[12] = edx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } + } else { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + } + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -833,6 +857,20 @@ static void generic_identify(struct cpuinfo_x86 *c) detect_nopl(c); } +static void x86_init_cache_qos(struct cpuinfo_x86 *c) +{ + /* + * The heavy lifting of max_rmid and cache_occ_scale are handled + * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu + * in case CQM bits really aren't there in this CPU. + */ + if (c != &boot_cpu_data) { + boot_cpu_data.x86_cache_max_rmid = + min(boot_cpu_data.x86_cache_max_rmid, + c->x86_cache_max_rmid); + } +} + /* * This does the hard work of actually picking apart the CPU stuff... */ @@ -922,6 +960,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) init_hypervisor(c); x86_init_rdrand(c); + x86_init_cache_qos(c); /* * Clear/Set all flags overriden by options, need do it -- cgit v1.2.3 From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:44 +0000 Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support Future Intel Xeon processors support a Cache QoS Monitoring feature that allows tracking of the LLC occupancy for a task or task group, i.e. the amount of data in pulled into the LLC for the task (group). Currently the PMU only supports per-cpu events. We create an event for each cpu and read out all the LLC occupancy values. Because this results in duplicate values being written out to userspace, we also export a .per-pkg event file so that the perf tools only accumulate values for one cpu per package. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 530 +++++++++++++++++++++++++++++ 1 file changed, 530 insertions(+) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c new file mode 100644 index 000000000000..05b4cd26f426 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -0,0 +1,530 @@ +/* + * Intel Cache Quality-of-Service Monitoring (CQM) support. + * + * Based very, very heavily on work by Peter Zijlstra. + */ + +#include +#include +#include +#include "perf_event.h" + +#define MSR_IA32_PQR_ASSOC 0x0c8f +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +static unsigned int cqm_max_rmid = -1; +static unsigned int cqm_l3_scale; /* supposedly cacheline size */ + +struct intel_cqm_state { + raw_spinlock_t lock; + int rmid; + int cnt; +}; + +static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); + +/* + * Protects cache_cgroups. + */ +static DEFINE_MUTEX(cache_mutex); + +/* + * Groups of events that have the same target(s), one RMID per group. + */ +static LIST_HEAD(cache_groups); + +/* + * Mask of CPUs for reading CQM values. We only need one per-socket. + */ +static cpumask_t cqm_cpumask; + +#define RMID_VAL_ERROR (1ULL << 63) +#define RMID_VAL_UNAVAIL (1ULL << 62) + +#define QOS_L3_OCCUP_EVENT_ID (1 << 0) + +#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID + +static u64 __rmid_read(unsigned long rmid) +{ + u64 val; + + /* + * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, + * it just says that to increase confusion. + */ + wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + /* + * Aside from the ERROR and UNAVAIL bits, assume this thing returns + * the number of cachelines tagged with @rmid. + */ + return val; +} + +static unsigned long *cqm_rmid_bitmap; + +/* + * Returns < 0 on fail. + */ +static int __get_rmid(void) +{ + return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0); +} + +static void __put_rmid(int rmid) +{ + bitmap_release_region(cqm_rmid_bitmap, rmid, 0); +} + +static int intel_cqm_setup_rmid_cache(void) +{ + cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL); + if (!cqm_rmid_bitmap) + return -ENOMEM; + + bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid); + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + bitmap_allocate_region(cqm_rmid_bitmap, 0, 0); + + return 0; +} + +/* + * Determine if @a and @b measure the same set of tasks. + */ +static bool __match_event(struct perf_event *a, struct perf_event *b) +{ + if ((a->attach_state & PERF_ATTACH_TASK) != + (b->attach_state & PERF_ATTACH_TASK)) + return false; + + /* not task */ + + return true; /* if not task, we're machine wide */ +} + +/* + * Determine if @a's tasks intersect with @b's tasks + */ +static bool __conflict_event(struct perf_event *a, struct perf_event *b) +{ + /* + * If one of them is not a task, same story as above with cgroups. + */ + if (!(a->attach_state & PERF_ATTACH_TASK) || + !(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Must be non-overlapping. + */ + return false; +} + +/* + * Find a group and setup RMID. + * + * If we're part of a group, we use the group's RMID. + */ +static int intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) +{ + struct perf_event *iter; + int rmid; + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + if (__match_event(iter, event)) { + /* All tasks in a group share an RMID */ + event->hw.cqm_rmid = iter->hw.cqm_rmid; + *group = iter; + return 0; + } + + if (__conflict_event(iter, event)) + return -EBUSY; + } + + rmid = __get_rmid(); + if (rmid < 0) + return rmid; + + event->hw.cqm_rmid = rmid; + return 0; +} + +static void intel_cqm_event_read(struct perf_event *event) +{ + unsigned long rmid = event->hw.cqm_rmid; + u64 val; + + val = __rmid_read(rmid); + + /* + * Ignore this reading on error states and do not update the value. + */ + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + local64_set(&event->count, val); +} + +static void intel_cqm_event_start(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned long rmid = event->hw.cqm_rmid; + unsigned long flags; + + if (!(event->hw.cqm_state & PERF_HES_STOPPED)) + return; + + event->hw.cqm_state &= ~PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + + if (state->cnt++) + WARN_ON_ONCE(state->rmid != rmid); + else + WARN_ON_ONCE(state->rmid); + + state->rmid = rmid; + wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static void intel_cqm_event_stop(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned long flags; + + if (event->hw.cqm_state & PERF_HES_STOPPED) + return; + + event->hw.cqm_state |= PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + intel_cqm_event_read(event); + + if (!--state->cnt) { + state->rmid = 0; + wrmsrl(MSR_IA32_PQR_ASSOC, 0); + } else { + WARN_ON_ONCE(!state->rmid); + } + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static int intel_cqm_event_add(struct perf_event *event, int mode) +{ + int rmid; + + event->hw.cqm_state = PERF_HES_STOPPED; + rmid = event->hw.cqm_rmid; + WARN_ON_ONCE(!rmid); + + if (mode & PERF_EF_START) + intel_cqm_event_start(event, mode); + + return 0; +} + +static void intel_cqm_event_del(struct perf_event *event, int mode) +{ + intel_cqm_event_stop(event, mode); +} + +static void intel_cqm_event_destroy(struct perf_event *event) +{ + struct perf_event *group_other = NULL; + + mutex_lock(&cache_mutex); + + /* + * If there's another event in this group... + */ + if (!list_empty(&event->hw.cqm_group_entry)) { + group_other = list_first_entry(&event->hw.cqm_group_entry, + struct perf_event, + hw.cqm_group_entry); + list_del(&event->hw.cqm_group_entry); + } + + /* + * And we're the group leader.. + */ + if (!list_empty(&event->hw.cqm_groups_entry)) { + /* + * If there was a group_other, make that leader, otherwise + * destroy the group and return the RMID. + */ + if (group_other) { + list_replace(&event->hw.cqm_groups_entry, + &group_other->hw.cqm_groups_entry); + } else { + int rmid = event->hw.cqm_rmid; + + __put_rmid(rmid); + list_del(&event->hw.cqm_groups_entry); + } + } + + mutex_unlock(&cache_mutex); +} + +static struct pmu intel_cqm_pmu; + +/* + * XXX there's a bit of a problem in that we cannot simply do the one + * event per node as one would want, since that one event would one get + * scheduled on the one cpu. But we want to 'schedule' the RMID on all + * CPUs. + * + * This means we want events for each CPU, however, that generates a lot + * of duplicate values out to userspace -- this is not to be helped + * unless we want to change the core code in some way. Fore more info, + * see intel_cqm_event_read(). + */ +static int intel_cqm_event_init(struct perf_event *event) +{ + struct perf_event *group = NULL; + int err; + + if (event->attr.type != intel_cqm_pmu.type) + return -ENOENT; + + if (event->attr.config & ~QOS_EVENT_MASK) + return -EINVAL; + + if (event->cpu == -1) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + INIT_LIST_HEAD(&event->hw.cqm_group_entry); + INIT_LIST_HEAD(&event->hw.cqm_groups_entry); + + event->destroy = intel_cqm_event_destroy; + + mutex_lock(&cache_mutex); + + err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + if (err) + goto out; + + if (group) { + list_add_tail(&event->hw.cqm_group_entry, + &group->hw.cqm_group_entry); + } else { + list_add_tail(&event->hw.cqm_groups_entry, + &cache_groups); + } + +out: + mutex_unlock(&cache_mutex); + return err; +} + +EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); +EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); +EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); +EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); +EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); + +static struct attribute *intel_cqm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + +static struct attribute_group intel_cqm_events_group = { + .name = "events", + .attrs = intel_cqm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *intel_cqm_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group intel_cqm_format_group = { + .name = "format", + .attrs = intel_cqm_formats_attr, +}; + +static const struct attribute_group *intel_cqm_attr_groups[] = { + &intel_cqm_events_group, + &intel_cqm_format_group, + NULL, +}; + +static struct pmu intel_cqm_pmu = { + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_del, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, +}; + +static inline void cqm_pick_event_reader(int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + for_each_cpu(i, &cqm_cpumask) { + if (phys_id == topology_physical_package_id(i)) + return; /* already got reader for this socket */ + } + + cpumask_set_cpu(cpu, &cqm_cpumask); +} + +static void intel_cqm_cpu_prepare(unsigned int cpu) +{ + struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + raw_spin_lock_init(&state->lock); + state->rmid = 0; + state->cnt = 0; + + WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); + WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); +} + +static void intel_cqm_cpu_exit(unsigned int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + /* + * Is @cpu a designated cqm reader? + */ + if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) + return; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + + if (phys_id == topology_physical_package_id(i)) { + cpumask_set_cpu(i, &cqm_cpumask); + break; + } + } +} + +static int intel_cqm_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + intel_cqm_cpu_prepare(cpu); + break; + case CPU_DOWN_PREPARE: + intel_cqm_cpu_exit(cpu); + break; + case CPU_STARTING: + cqm_pick_event_reader(cpu); + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id intel_cqm_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, + {} +}; + +static int __init intel_cqm_init(void) +{ + char *str, scale[20]; + int i, cpu, ret; + + if (!x86_match_cpu(intel_cqm_match)) + return -ENODEV; + + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; + + /* + * It's possible that not all resources support the same number + * of RMIDs. Instead of making scheduling much more complicated + * (where we have to match a task's RMID to a cpu that supports + * that many RMIDs) just find the minimum RMIDs supported across + * all cpus. + * + * Also, check that the scales match on all cpus. + */ + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_cache_max_rmid < cqm_max_rmid) + cqm_max_rmid = c->x86_cache_max_rmid; + + if (c->x86_cache_occ_scale != cqm_l3_scale) { + pr_err("Multiple LLC scale values, disabling\n"); + ret = -EINVAL; + goto out; + } + } + + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); + str = kstrdup(scale, GFP_KERNEL); + if (!str) { + ret = -ENOMEM; + goto out; + } + + event_attr_intel_cqm_llc_scale.event_str = str; + + ret = intel_cqm_setup_rmid_cache(); + if (ret) + goto out; + + for_each_online_cpu(i) { + intel_cqm_cpu_prepare(i); + cqm_pick_event_reader(i); + } + + __perf_cpu_notifier(intel_cqm_cpu_notifier); + + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); + + if (ret) + pr_err("Intel CQM perf registration failed: %d\n", ret); + else + pr_info("Intel CQM monitoring enabled\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +device_initcall(intel_cqm_init); -- cgit v1.2.3 From 35298e554c74b7849875e3676ba8eaf833c7b917 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:45 +0000 Subject: perf/x86/intel: Implement LRU monitoring ID allocation for CQM It's possible to run into issues with re-using unused monitoring IDs because there may be stale cachelines associated with that ID from a previous allocation. This can cause the LLC occupancy values to be inaccurate. To attempt to mitigate this problem we place the IDs on a least recently used list, essentially a FIFO. The basic idea is that the longer the time period between ID re-use the lower the probability that stale cachelines exist in the cache. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-7-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 100 ++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 05b4cd26f426..b5d9d746dbc0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -25,7 +25,7 @@ struct intel_cqm_state { static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); /* - * Protects cache_cgroups. + * Protects cache_cgroups and cqm_rmid_lru. */ static DEFINE_MUTEX(cache_mutex); @@ -64,36 +64,120 @@ static u64 __rmid_read(unsigned long rmid) return val; } -static unsigned long *cqm_rmid_bitmap; +struct cqm_rmid_entry { + u64 rmid; + struct list_head list; +}; + +/* + * A least recently used list of RMIDs. + * + * Oldest entry at the head, newest (most recently used) entry at the + * tail. This list is never traversed, it's only used to keep track of + * the lru order. That is, we only pick entries of the head or insert + * them on the tail. + * + * All entries on the list are 'free', and their RMIDs are not currently + * in use. To mark an RMID as in use, remove its entry from the lru + * list. + * + * This list is protected by cache_mutex. + */ +static LIST_HEAD(cqm_rmid_lru); + +/* + * We use a simple array of pointers so that we can lookup a struct + * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() + * and __put_rmid() from having to worry about dealing with struct + * cqm_rmid_entry - they just deal with rmids, i.e. integers. + * + * Once this array is initialized it is read-only. No locks are required + * to access it. + * + * All entries for all RMIDs can be looked up in the this array at all + * times. + */ +static struct cqm_rmid_entry **cqm_rmid_ptrs; + +static inline struct cqm_rmid_entry *__rmid_entry(int rmid) +{ + struct cqm_rmid_entry *entry; + + entry = cqm_rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} /* * Returns < 0 on fail. + * + * We expect to be called with cache_mutex held. */ static int __get_rmid(void) { - return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0); + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + if (list_empty(&cqm_rmid_lru)) + return -EAGAIN; + + entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; } static void __put_rmid(int rmid) { - bitmap_release_region(cqm_rmid_bitmap, rmid, 0); + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + entry = __rmid_entry(rmid); + + list_add_tail(&entry->list, &cqm_rmid_lru); } static int intel_cqm_setup_rmid_cache(void) { - cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL); - if (!cqm_rmid_bitmap) + struct cqm_rmid_entry *entry; + int r; + + cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + (cqm_max_rmid + 1), GFP_KERNEL); + if (!cqm_rmid_ptrs) return -ENOMEM; - bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid); + for (r = 0; r <= cqm_max_rmid; r++) { + struct cqm_rmid_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto fail; + + INIT_LIST_HEAD(&entry->list); + entry->rmid = r; + cqm_rmid_ptrs[r] = entry; + + list_add_tail(&entry->list, &cqm_rmid_lru); + } /* * RMID 0 is special and is always allocated. It's used for all * tasks that are not monitored. */ - bitmap_allocate_region(cqm_rmid_bitmap, 0, 0); + entry = __rmid_entry(0); + list_del(&entry->list); return 0; +fail: + while (r--) + kfree(cqm_rmid_ptrs[r]); + + kfree(cqm_rmid_ptrs); + return -ENOMEM; } /* -- cgit v1.2.3 From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:46 +0000 Subject: perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++---- 1 file changed, 174 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index b5d9d746dbc0..8003d87afd89 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -182,23 +182,124 @@ fail: /* * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. */ static bool __match_event(struct perf_event *a, struct perf_event *b) { + /* Per-cpu and task events don't mix */ if ((a->attach_state & PERF_ATTACH_TASK) != (b->attach_state & PERF_ATTACH_TASK)) return false; - /* not task */ +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.cqm_target == b->hw.cqm_target) + return true; + + /* + * Are we an inherited event? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.cqm_target); - return true; /* if not task, we're machine wide */ + return event->cgrp; } +#endif /* * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. */ static bool __conflict_event(struct perf_event *a, struct perf_event *b) { +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. + */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif /* * If one of them is not a task, same story as above with cgroups. */ @@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { - unsigned long rmid = event->hw.cqm_rmid; + unsigned long rmid; u64 val; + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + rmid = event->hw.cqm_rmid; val = __rmid_read(rmid); /* @@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event) local64_set(&event->count, val); } +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + struct rmid_read rr = { + .rmid = event->hw.cqm_rmid, + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + local64_set(&event->count, atomic64_read(&rr.value)); + + return __perf_event_count(event); +} + static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); @@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) /* * And we're the group leader.. */ - if (!list_empty(&event->hw.cqm_groups_entry)) { + if (cqm_group_leader(event)) { /* * If there was a group_other, make that leader, otherwise * destroy the group and return the RMID. @@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event) static struct pmu intel_cqm_pmu; -/* - * XXX there's a bit of a problem in that we cannot simply do the one - * event per node as one would want, since that one event would one get - * scheduled on the one cpu. But we want to 'schedule' the RMID on all - * CPUs. - * - * This means we want events for each CPU, however, that generates a lot - * of duplicate values out to userspace -- this is not to be helped - * unless we want to change the core code in some way. Fore more info, - * see intel_cqm_event_read(). - */ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; @@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.config & ~QOS_EVENT_MASK) return -EINVAL; - if (event->cpu == -1) - return -EINVAL; - /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); - err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + /* Will also set rmid */ + err = intel_cqm_setup_event(event, &group); if (err) goto out; @@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = { .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, + .count = intel_cqm_event_count, }; static inline void cqm_pick_event_reader(int cpu) @@ -599,8 +752,8 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", + PERF_TYPE_INTEL_CQM); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else -- cgit v1.2.3 From bff671dba7981195a644a5dc210d65de8ae2d251 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:47 +0000 Subject: perf/x86/intel: Perform rotation on Intel CQM RMIDs There are many use cases where people will want to monitor more tasks than there exist RMIDs in the hardware, meaning that we have to perform some kind of multiplexing. We do this by "rotating" the RMIDs in a workqueue, and assigning an RMID to a waiting event when the RMID becomes unused. This scheme reserves one RMID at all times for rotation. When we need to schedule a new event we give it the reserved RMID, pick a victim event from the front of the global CQM list and wait for the victim's RMID to drop to zero occupancy, before it becomes the new reserved RMID. We put the victim's RMID onto the limbo list, where it resides for a "minimum queue time", which is intended to save ourselves an expensive smp IPI when the RMID is unlikely to have a occupancy value below __intel_cqm_threshold. If we fail to recycle an RMID, even after waiting the minimum queue time then we need to increment __intel_cqm_threshold. There is an upper bound on this threshold, __intel_cqm_max_threshold, which is programmable from userland as /sys/devices/intel_cqm/max_recycling_threshold. The comments above __intel_cqm_rmid_rotate() have more details. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-9-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 671 ++++++++++++++++++++++++++--- 1 file changed, 623 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 8003d87afd89..e31f5086f2b5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -25,9 +25,13 @@ struct intel_cqm_state { static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); /* - * Protects cache_cgroups and cqm_rmid_lru. + * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. + * Also protects event->hw.cqm_rmid + * + * Hold either for stability, both for modification of ->hw.cqm_rmid. */ static DEFINE_MUTEX(cache_mutex); +static DEFINE_RAW_SPINLOCK(cache_lock); /* * Groups of events that have the same target(s), one RMID per group. @@ -46,7 +50,34 @@ static cpumask_t cqm_cpumask; #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID -static u64 __rmid_read(unsigned long rmid) +/* + * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). + * + * This rmid is always free and is guaranteed to have an associated + * near-zero occupancy value, i.e. no cachelines are tagged with this + * RMID, once __intel_cqm_rmid_rotate() returns. + */ +static unsigned int intel_cqm_rotation_rmid; + +#define INVALID_RMID (-1) + +/* + * Is @rmid valid for programming the hardware? + * + * rmid 0 is reserved by the hardware for all non-monitored tasks, which + * means that we should never come across an rmid with that value. + * Likewise, an rmid value of -1 is used to indicate "no rmid currently + * assigned" and is used as part of the rotation code. + */ +static inline bool __rmid_valid(unsigned int rmid) +{ + if (!rmid || rmid == INVALID_RMID) + return false; + + return true; +} + +static u64 __rmid_read(unsigned int rmid) { u64 val; @@ -64,13 +95,21 @@ static u64 __rmid_read(unsigned long rmid) return val; } +enum rmid_recycle_state { + RMID_YOUNG = 0, + RMID_AVAILABLE, + RMID_DIRTY, +}; + struct cqm_rmid_entry { - u64 rmid; + unsigned int rmid; + enum rmid_recycle_state state; struct list_head list; + unsigned long queue_time; }; /* - * A least recently used list of RMIDs. + * cqm_rmid_free_lru - A least recently used list of RMIDs. * * Oldest entry at the head, newest (most recently used) entry at the * tail. This list is never traversed, it's only used to keep track of @@ -81,9 +120,18 @@ struct cqm_rmid_entry { * in use. To mark an RMID as in use, remove its entry from the lru * list. * - * This list is protected by cache_mutex. + * + * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. + * + * This list is contains RMIDs that no one is currently using but that + * may have a non-zero occupancy value associated with them. The + * rotation worker moves RMIDs from the limbo list to the free list once + * the occupancy value drops below __intel_cqm_threshold. + * + * Both lists are protected by cache_mutex. */ -static LIST_HEAD(cqm_rmid_lru); +static LIST_HEAD(cqm_rmid_free_lru); +static LIST_HEAD(cqm_rmid_limbo_lru); /* * We use a simple array of pointers so that we can lookup a struct @@ -120,37 +168,43 @@ static int __get_rmid(void) lockdep_assert_held(&cache_mutex); - if (list_empty(&cqm_rmid_lru)) - return -EAGAIN; + if (list_empty(&cqm_rmid_free_lru)) + return INVALID_RMID; - entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list); + entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); list_del(&entry->list); return entry->rmid; } -static void __put_rmid(int rmid) +static void __put_rmid(unsigned int rmid) { struct cqm_rmid_entry *entry; lockdep_assert_held(&cache_mutex); + WARN_ON(!__rmid_valid(rmid)); entry = __rmid_entry(rmid); - list_add_tail(&entry->list, &cqm_rmid_lru); + entry->queue_time = jiffies; + entry->state = RMID_YOUNG; + + list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } static int intel_cqm_setup_rmid_cache(void) { struct cqm_rmid_entry *entry; - int r; + unsigned int nr_rmids; + int r = 0; + nr_rmids = cqm_max_rmid + 1; cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * - (cqm_max_rmid + 1), GFP_KERNEL); + nr_rmids, GFP_KERNEL); if (!cqm_rmid_ptrs) return -ENOMEM; - for (r = 0; r <= cqm_max_rmid; r++) { + for (; r <= cqm_max_rmid; r++) { struct cqm_rmid_entry *entry; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -161,7 +215,7 @@ static int intel_cqm_setup_rmid_cache(void) entry->rmid = r; cqm_rmid_ptrs[r] = entry; - list_add_tail(&entry->list, &cqm_rmid_lru); + list_add_tail(&entry->list, &cqm_rmid_free_lru); } /* @@ -171,6 +225,10 @@ static int intel_cqm_setup_rmid_cache(void) entry = __rmid_entry(0); list_del(&entry->list); + mutex_lock(&cache_mutex); + intel_cqm_rotation_rmid = __get_rmid(); + mutex_unlock(&cache_mutex); + return 0; fail: while (r--) @@ -313,6 +371,424 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) return false; } +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info); + +/* + * Exchange the RMID of a group of events. + */ +static unsigned int +intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) +{ + struct perf_event *event; + unsigned int old_rmid = group->hw.cqm_rmid; + struct list_head *head = &group->hw.cqm_group_entry; + + lockdep_assert_held(&cache_mutex); + + /* + * If our RMID is being deallocated, perform a read now. + */ + if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + .rmid = old_rmid, + }; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + local64_set(&group->count, atomic64_read(&rr.value)); + } + + raw_spin_lock_irq(&cache_lock); + + group->hw.cqm_rmid = rmid; + list_for_each_entry(event, head, hw.cqm_group_entry) + event->hw.cqm_rmid = rmid; + + raw_spin_unlock_irq(&cache_lock); + + return old_rmid; +} + +/* + * If we fail to assign a new RMID for intel_cqm_rotation_rmid because + * cachelines are still tagged with RMIDs in limbo, we progressively + * increment the threshold until we find an RMID in limbo with <= + * __intel_cqm_threshold lines tagged. This is designed to mitigate the + * problem where cachelines tagged with an RMID are not steadily being + * evicted. + * + * On successful rotations we decrease the threshold back towards zero. + * + * __intel_cqm_max_threshold provides an upper bound on the threshold, + * and is measured in bytes because it's exposed to userland. + */ +static unsigned int __intel_cqm_threshold; +static unsigned int __intel_cqm_max_threshold; + +/* + * Test whether an RMID has a zero occupancy value on this cpu. + */ +static void intel_cqm_stable(void *arg) +{ + struct cqm_rmid_entry *entry; + + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + if (entry->state != RMID_AVAILABLE) + break; + + if (__rmid_read(entry->rmid) > __intel_cqm_threshold) + entry->state = RMID_DIRTY; + } +} + +/* + * If we have group events waiting for an RMID that don't conflict with + * events already running, assign @rmid. + */ +static bool intel_cqm_sched_in_event(unsigned int rmid) +{ + struct perf_event *leader, *event; + + lockdep_assert_held(&cache_mutex); + + leader = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + event = leader; + + list_for_each_entry_continue(event, &cache_groups, + hw.cqm_groups_entry) { + if (__rmid_valid(event->hw.cqm_rmid)) + continue; + + if (__conflict_event(event, leader)) + continue; + + intel_cqm_xchg_rmid(event, rmid); + return true; + } + + return false; +} + +/* + * Initially use this constant for both the limbo queue time and the + * rotation timer interval, pmu::hrtimer_interval_ms. + * + * They don't need to be the same, but the two are related since if you + * rotate faster than you recycle RMIDs, you may run out of available + * RMIDs. + */ +#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ + +static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; + +/* + * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list + * @nr_available: number of freeable RMIDs on the limbo list + * + * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no + * cachelines are tagged with those RMIDs. After this we can reuse them + * and know that the current set of active RMIDs is stable. + * + * Return %true or %false depending on whether stabilization needs to be + * reattempted. + * + * If we return %true then @nr_available is updated to indicate the + * number of RMIDs on the limbo list that have been queued for the + * minimum queue time (RMID_AVAILABLE), but whose data occupancy values + * are above __intel_cqm_threshold. + */ +static bool intel_cqm_rmid_stabilize(unsigned int *available) +{ + struct cqm_rmid_entry *entry, *tmp; + struct perf_event *event; + + lockdep_assert_held(&cache_mutex); + + *available = 0; + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + unsigned long min_queue_time; + unsigned long now = jiffies; + + /* + * We hold RMIDs placed into limbo for a minimum queue + * time. Before the minimum queue time has elapsed we do + * not recycle RMIDs. + * + * The reasoning is that until a sufficient time has + * passed since we stopped using an RMID, any RMID + * placed onto the limbo list will likely still have + * data tagged in the cache, which means we'll probably + * fail to recycle it anyway. + * + * We can save ourselves an expensive IPI by skipping + * any RMIDs that have not been queued for the minimum + * time. + */ + min_queue_time = entry->queue_time + + msecs_to_jiffies(__rmid_queue_time_ms); + + if (time_after(min_queue_time, now)) + break; + + entry->state = RMID_AVAILABLE; + (*available)++; + } + + /* + * Fast return if none of the RMIDs on the limbo list have been + * sitting on the queue for the minimum queue time. + */ + if (!*available) + return false; + + /* + * Test whether an RMID is free for each package. + */ + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); + + list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { + /* + * Exhausted all RMIDs that have waited min queue time. + */ + if (entry->state == RMID_YOUNG) + break; + + if (entry->state == RMID_DIRTY) + continue; + + list_del(&entry->list); /* remove from limbo */ + + /* + * The rotation RMID gets priority if it's + * currently invalid. In which case, skip adding + * the RMID to the the free lru. + */ + if (!__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_rotation_rmid = entry->rmid; + continue; + } + + /* + * If we have groups waiting for RMIDs, hand + * them one now. + */ + list_for_each_entry(event, &cache_groups, + hw.cqm_groups_entry) { + if (__rmid_valid(event->hw.cqm_rmid)) + continue; + + intel_cqm_xchg_rmid(event, entry->rmid); + entry = NULL; + break; + } + + if (!entry) + continue; + + /* + * Otherwise place it onto the free list. + */ + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + + return __rmid_valid(intel_cqm_rotation_rmid); +} + +/* + * Pick a victim group and move it to the tail of the group list. + */ +static struct perf_event * +__intel_cqm_pick_and_rotate(void) +{ + struct perf_event *rotor; + + lockdep_assert_held(&cache_mutex); + lockdep_assert_held(&cache_lock); + + rotor = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + list_rotate_left(&cache_groups); + + return rotor; +} + +/* + * Attempt to rotate the groups and assign new RMIDs. + * + * Rotating RMIDs is complicated because the hardware doesn't give us + * any clues. + * + * There's problems with the hardware interface; when you change the + * task:RMID map cachelines retain their 'old' tags, giving a skewed + * picture. In order to work around this, we must always keep one free + * RMID - intel_cqm_rotation_rmid. + * + * Rotation works by taking away an RMID from a group (the old RMID), + * and assigning the free RMID to another group (the new RMID). We must + * then wait for the old RMID to not be used (no cachelines tagged). + * This ensure that all cachelines are tagged with 'active' RMIDs. At + * this point we can start reading values for the new RMID and treat the + * old RMID as the free RMID for the next rotation. + * + * Return %true or %false depending on whether we did any rotating. + */ +static bool __intel_cqm_rmid_rotate(void) +{ + struct perf_event *group, *rotor, *start = NULL; + unsigned int threshold_limit; + unsigned int nr_needed = 0; + unsigned int nr_available; + unsigned int rmid; + bool rotated = false; + + mutex_lock(&cache_mutex); + +again: + /* + * Fast path through this function if there are no groups and no + * RMIDs that need cleaning. + */ + if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { + if (!__rmid_valid(group->hw.cqm_rmid)) { + if (!start) + start = group; + nr_needed++; + } + } + + /* + * We have some event groups, but they all have RMIDs assigned + * and no RMIDs need cleaning. + */ + if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + if (!nr_needed) + goto stabilize; + + /* + * We have more event groups without RMIDs than available RMIDs. + * + * We force deallocate the rmid of the group at the head of + * cache_groups. The first event group without an RMID then gets + * assigned intel_cqm_rotation_rmid. This ensures we always make + * forward progress. + * + * Rotate the cache_groups list so the previous head is now the + * tail. + */ + rotor = __intel_cqm_pick_and_rotate(); + rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); + + /* + * The group at the front of the list should always have a valid + * RMID. If it doesn't then no groups have RMIDs assigned. + */ + if (!__rmid_valid(rmid)) + goto stabilize; + + /* + * If the rotation is going to succeed, reduce the threshold so + * that we don't needlessly reuse dirty RMIDs. + */ + if (__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); + intel_cqm_rotation_rmid = INVALID_RMID; + + if (__intel_cqm_threshold) + __intel_cqm_threshold--; + } + + __put_rmid(rmid); + + rotated = true; + +stabilize: + /* + * We now need to stablize the RMID we freed above (if any) to + * ensure that the next time we rotate we have an RMID with zero + * occupancy value. + * + * Alternatively, if we didn't need to perform any rotation, + * we'll have a bunch of RMIDs in limbo that need stabilizing. + */ + threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; + + while (intel_cqm_rmid_stabilize(&nr_available) && + __intel_cqm_threshold < threshold_limit) { + unsigned int steal_limit; + + /* + * Don't spin if nobody is actively waiting for an RMID, + * the rotation worker will be kicked as soon as an + * event needs an RMID anyway. + */ + if (!nr_needed) + break; + + /* Allow max 25% of RMIDs to be in limbo. */ + steal_limit = (cqm_max_rmid + 1) / 4; + + /* + * We failed to stabilize any RMIDs so our rotation + * logic is now stuck. In order to make forward progress + * we have a few options: + * + * 1. rotate ("steal") another RMID + * 2. increase the threshold + * 3. do nothing + * + * We do both of 1. and 2. until we hit the steal limit. + * + * The steal limit prevents all RMIDs ending up on the + * limbo list. This can happen if every RMID has a + * non-zero occupancy above threshold_limit, and the + * occupancy values aren't dropping fast enough. + * + * Note that there is prioritisation at work here - we'd + * rather increase the number of RMIDs on the limbo list + * than increase the threshold, because increasing the + * threshold skews the event data (because we reuse + * dirty RMIDs) - threshold bumps are a last resort. + */ + if (nr_available < steal_limit) + goto again; + + __intel_cqm_threshold++; + } + +out: + mutex_unlock(&cache_mutex); + return rotated; +} + +static void intel_cqm_rmid_rotate(struct work_struct *work); + +static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); + +static struct pmu intel_cqm_pmu; + +static void intel_cqm_rmid_rotate(struct work_struct *work) +{ + unsigned long delay; + + __intel_cqm_rmid_rotate(); + + delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); + schedule_delayed_work(&intel_cqm_rmid_work, delay); +} + /* * Find a group and setup RMID. * @@ -322,7 +798,6 @@ static int intel_cqm_setup_event(struct perf_event *event, struct perf_event **group) { struct perf_event *iter; - int rmid; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { if (__match_event(iter, event)) { @@ -336,17 +811,14 @@ static int intel_cqm_setup_event(struct perf_event *event, return -EBUSY; } - rmid = __get_rmid(); - if (rmid < 0) - return rmid; - - event->hw.cqm_rmid = rmid; + event->hw.cqm_rmid = __get_rmid(); return 0; } static void intel_cqm_event_read(struct perf_event *event) { - unsigned long rmid; + unsigned long flags; + unsigned int rmid; u64 val; /* @@ -355,23 +827,25 @@ static void intel_cqm_event_read(struct perf_event *event) if (event->cpu == -1) return; + raw_spin_lock_irqsave(&cache_lock, flags); rmid = event->hw.cqm_rmid; + + if (!__rmid_valid(rmid)) + goto out; + val = __rmid_read(rmid); /* * Ignore this reading on error states and do not update the value. */ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; + goto out; local64_set(&event->count, val); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); } -struct rmid_read { - unsigned int rmid; - atomic64_t value; -}; - static void __intel_cqm_event_count(void *info) { struct rmid_read *rr = info; @@ -392,8 +866,8 @@ static inline bool cqm_group_leader(struct perf_event *event) static u64 intel_cqm_event_count(struct perf_event *event) { + unsigned long flags; struct rmid_read rr = { - .rmid = event->hw.cqm_rmid, .value = ATOMIC64_INIT(0), }; @@ -417,17 +891,36 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!cqm_group_leader(event)) return 0; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + /* + * Notice that we don't perform the reading of an RMID + * atomically, because we can't hold a spin lock across the + * IPIs. + * + * Speculatively perform the read, since @event might be + * assigned a different (possibly invalid) RMID while we're + * busying performing the IPI calls. It's therefore necessary to + * check @event's RMID afterwards, and if it has changed, + * discard the result of the read. + */ + rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - local64_set(&event->count, atomic64_read(&rr.value)); + if (!__rmid_valid(rr.rmid)) + goto out; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + raw_spin_lock_irqsave(&cache_lock, flags); + if (event->hw.cqm_rmid == rr.rmid) + local64_set(&event->count, atomic64_read(&rr.value)); + raw_spin_unlock_irqrestore(&cache_lock, flags); +out: return __perf_event_count(event); } static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned long rmid = event->hw.cqm_rmid; + unsigned int rmid = event->hw.cqm_rmid; unsigned long flags; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) @@ -473,15 +966,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode) static int intel_cqm_event_add(struct perf_event *event, int mode) { - int rmid; + unsigned long flags; + unsigned int rmid; + + raw_spin_lock_irqsave(&cache_lock, flags); event->hw.cqm_state = PERF_HES_STOPPED; rmid = event->hw.cqm_rmid; - WARN_ON_ONCE(!rmid); - if (mode & PERF_EF_START) + if (__rmid_valid(rmid) && (mode & PERF_EF_START)) intel_cqm_event_start(event, mode); + raw_spin_unlock_irqrestore(&cache_lock, flags); + return 0; } @@ -518,9 +1015,10 @@ static void intel_cqm_event_destroy(struct perf_event *event) list_replace(&event->hw.cqm_groups_entry, &group_other->hw.cqm_groups_entry); } else { - int rmid = event->hw.cqm_rmid; + unsigned int rmid = event->hw.cqm_rmid; - __put_rmid(rmid); + if (__rmid_valid(rmid)) + __put_rmid(rmid); list_del(&event->hw.cqm_groups_entry); } } @@ -528,11 +1026,10 @@ static void intel_cqm_event_destroy(struct perf_event *event) mutex_unlock(&cache_mutex); } -static struct pmu intel_cqm_pmu; - static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; + bool rotate = false; int err; if (event->attr.type != intel_cqm_pmu.type) @@ -569,10 +1066,24 @@ static int intel_cqm_event_init(struct perf_event *event) } else { list_add_tail(&event->hw.cqm_groups_entry, &cache_groups); + + /* + * All RMIDs are either in use or have recently been + * used. Kick the rotation worker to clean/free some. + * + * We only do this for the group leader, rather than for + * every event in a group to save on needless work. + */ + if (!__rmid_valid(event->hw.cqm_rmid)) + rotate = true; } out: mutex_unlock(&cache_mutex); + + if (rotate) + schedule_delayed_work(&intel_cqm_rmid_work, 0); + return err; } @@ -607,22 +1118,76 @@ static struct attribute_group intel_cqm_format_group = { .attrs = intel_cqm_formats_attr, }; +static ssize_t +max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + ssize_t rv; + + mutex_lock(&cache_mutex); + rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); + mutex_unlock(&cache_mutex); + + return rv; +} + +static ssize_t +max_recycle_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int bytes, cachelines; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + mutex_lock(&cache_mutex); + + __intel_cqm_max_threshold = bytes; + cachelines = bytes / cqm_l3_scale; + + /* + * The new maximum takes effect immediately. + */ + if (__intel_cqm_threshold > cachelines) + __intel_cqm_threshold = cachelines; + + mutex_unlock(&cache_mutex); + + return count; +} + +static DEVICE_ATTR_RW(max_recycle_threshold); + +static struct attribute *intel_cqm_attrs[] = { + &dev_attr_max_recycle_threshold.attr, + NULL, +}; + +static const struct attribute_group intel_cqm_group = { + .attrs = intel_cqm_attrs, +}; + static const struct attribute_group *intel_cqm_attr_groups[] = { &intel_cqm_events_group, &intel_cqm_format_group, + &intel_cqm_group, NULL, }; static struct pmu intel_cqm_pmu = { - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_del, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, + .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_del, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, + .count = intel_cqm_event_count, }; static inline void cqm_pick_event_reader(int cpu) @@ -732,6 +1297,16 @@ static int __init intel_cqm_init(void) } } + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + __intel_cqm_max_threshold = + boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); str = kstrdup(scale, GFP_KERNEL); if (!str) { -- cgit v1.2.3 From 59bf7fd45c90a8fde22a7717b5413e4ed9666c32 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:48 +0000 Subject: perf/x86/intel: Enable conflicting event scheduling for CQM We can leverage the workqueue that we use for RMID rotation to support scheduling of conflicting monitoring events. Allowing events that monitor conflicting things is done at various other places in the perf subsystem, so there's precedent there. An example of two conflicting events would be monitoring a cgroup and simultaneously monitoring a task within that cgroup. This uses the cache_groups list as a queuing mechanism, where every event that reaches the front of the list gets the chance to be scheduled in, possibly descheduling any conflicting events that are running. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-10-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 130 +++++++++++++++++++---------- 1 file changed, 84 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index e31f5086f2b5..9a8ef8376fcd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -507,7 +507,6 @@ static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; static bool intel_cqm_rmid_stabilize(unsigned int *available) { struct cqm_rmid_entry *entry, *tmp; - struct perf_event *event; lockdep_assert_held(&cache_mutex); @@ -577,19 +576,9 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available) /* * If we have groups waiting for RMIDs, hand - * them one now. + * them one now provided they don't conflict. */ - list_for_each_entry(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - intel_cqm_xchg_rmid(event, entry->rmid); - entry = NULL; - break; - } - - if (!entry) + if (intel_cqm_sched_in_event(entry->rmid)) continue; /* @@ -604,25 +593,73 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available) /* * Pick a victim group and move it to the tail of the group list. + * @next: The first group without an RMID */ -static struct perf_event * -__intel_cqm_pick_and_rotate(void) +static void __intel_cqm_pick_and_rotate(struct perf_event *next) { struct perf_event *rotor; + unsigned int rmid; lockdep_assert_held(&cache_mutex); - lockdep_assert_held(&cache_lock); rotor = list_first_entry(&cache_groups, struct perf_event, hw.cqm_groups_entry); + + /* + * The group at the front of the list should always have a valid + * RMID. If it doesn't then no groups have RMIDs assigned and we + * don't need to rotate the list. + */ + if (next == rotor) + return; + + rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); + __put_rmid(rmid); + list_rotate_left(&cache_groups); +} + +/* + * Deallocate the RMIDs from any events that conflict with @event, and + * place them on the back of the group list. + */ +static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) +{ + struct perf_event *group, *g; + unsigned int rmid; + + lockdep_assert_held(&cache_mutex); + + list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { + if (group == event) + continue; + + rmid = group->hw.cqm_rmid; + + /* + * Skip events that don't have a valid RMID. + */ + if (!__rmid_valid(rmid)) + continue; + + /* + * No conflict? No problem! Leave the event alone. + */ + if (!__conflict_event(group, event)) + continue; - return rotor; + intel_cqm_xchg_rmid(group, INVALID_RMID); + __put_rmid(rmid); + } } /* * Attempt to rotate the groups and assign new RMIDs. * + * We rotate for two reasons, + * 1. To handle the scheduling of conflicting events + * 2. To recycle RMIDs + * * Rotating RMIDs is complicated because the hardware doesn't give us * any clues. * @@ -642,11 +679,10 @@ __intel_cqm_pick_and_rotate(void) */ static bool __intel_cqm_rmid_rotate(void) { - struct perf_event *group, *rotor, *start = NULL; + struct perf_event *group, *start = NULL; unsigned int threshold_limit; unsigned int nr_needed = 0; unsigned int nr_available; - unsigned int rmid; bool rotated = false; mutex_lock(&cache_mutex); @@ -678,7 +714,9 @@ again: goto stabilize; /* - * We have more event groups without RMIDs than available RMIDs. + * We have more event groups without RMIDs than available RMIDs, + * or we have event groups that conflict with the ones currently + * scheduled. * * We force deallocate the rmid of the group at the head of * cache_groups. The first event group without an RMID then gets @@ -688,15 +726,7 @@ again: * Rotate the cache_groups list so the previous head is now the * tail. */ - rotor = __intel_cqm_pick_and_rotate(); - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned. - */ - if (!__rmid_valid(rmid)) - goto stabilize; + __intel_cqm_pick_and_rotate(start); /* * If the rotation is going to succeed, reduce the threshold so @@ -704,14 +734,14 @@ again: */ if (__rmid_valid(intel_cqm_rotation_rmid)) { intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = INVALID_RMID; + intel_cqm_rotation_rmid = __get_rmid(); + + intel_cqm_sched_out_conflicting_events(start); if (__intel_cqm_threshold) __intel_cqm_threshold--; } - __put_rmid(rmid); - rotated = true; stabilize: @@ -794,25 +824,37 @@ static void intel_cqm_rmid_rotate(struct work_struct *work) * * If we're part of a group, we use the group's RMID. */ -static int intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) +static void intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) { struct perf_event *iter; + unsigned int rmid; + bool conflict = false; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + rmid = iter->hw.cqm_rmid; + if (__match_event(iter, event)) { /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = iter->hw.cqm_rmid; + event->hw.cqm_rmid = rmid; *group = iter; - return 0; + return; } - if (__conflict_event(iter, event)) - return -EBUSY; + /* + * We only care about conflicts for events that are + * actually scheduled in (and hence have a valid RMID). + */ + if (__conflict_event(iter, event) && __rmid_valid(rmid)) + conflict = true; } - event->hw.cqm_rmid = __get_rmid(); - return 0; + if (conflict) + rmid = INVALID_RMID; + else + rmid = __get_rmid(); + + event->hw.cqm_rmid = rmid; } static void intel_cqm_event_read(struct perf_event *event) @@ -1030,7 +1072,6 @@ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; bool rotate = false; - int err; if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; @@ -1056,9 +1097,7 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); /* Will also set rmid */ - err = intel_cqm_setup_event(event, &group); - if (err) - goto out; + intel_cqm_setup_event(event, &group); if (group) { list_add_tail(&event->hw.cqm_group_entry, @@ -1078,13 +1117,12 @@ static int intel_cqm_event_init(struct perf_event *event) rotate = true; } -out: mutex_unlock(&cache_mutex); if (rotate) schedule_delayed_work(&intel_cqm_rmid_work, 0); - return err; + return 0; } EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -- cgit v1.2.3 From 66c046b407166686aeea2e8c0abfa1c4e4b94f26 Mon Sep 17 00:00:00 2001 From: "Lad, Prabhakar" Date: Thu, 5 Feb 2015 10:29:35 +0000 Subject: crypto: sha-mb - Fix big integer constant sparse warning this patch fixes following sparse warning: sha1_mb_mgr_init_avx2.c:59:31: warning: constant 0xF76543210 is so big it is long Signed-off-by: Lad, Prabhakar Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c index 4ca7e166a2aa..822acb5b464c 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c @@ -56,7 +56,7 @@ void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) { unsigned int j; - state->unused_lanes = 0xF76543210; + state->unused_lanes = 0xF76543210ULL; for (j = 0; j < 8; j++) { state->lens[j] = 0xFFFFFFFF; state->ldata[j].job_in_lane = NULL; -- cgit v1.2.3 From 81e397d937a8e9f46f024a9f876cf14d8e2b45a7 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Fri, 6 Feb 2015 10:25:20 -0800 Subject: crypto: aesni - make driver-gcm-aes-aesni helper a proper aead alg Changed the __driver-gcm-aes-aesni to be a proper aead algorithm. This required a valid setkey and setauthsize functions to be added and also some changes to make sure that math context is not corrupted when the alg is used directly. Note that the __driver-gcm-aes-aesni should not be used directly by modules that can use it in interrupt context as we don't have a good fallback mechanism in this case. Signed-off-by: Adrian Hoban Signed-off-by: Tadeusz Struk Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 164 +++++++++++++++++++++++++------------ 1 file changed, 110 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 947c6bf52c33..6893f4947583 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -890,15 +890,12 @@ out_free_ablkcipher: return ret; } -static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) +static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) { int ret = 0; - struct crypto_tfm *tfm = crypto_aead_tfm(parent); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - struct aesni_rfc4106_gcm_ctx *child_ctx = - aesni_rfc4106_gcm_ctx_get(cryptd_child); + struct crypto_tfm *tfm = crypto_aead_tfm(aead); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { @@ -943,20 +940,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, goto exit; } ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); - memcpy(child_ctx, ctx, sizeof(*ctx)); exit: kfree(new_key_mem); return ret; } -/* This is the Integrity Check Value (aka the authentication tag length and can - * be 8, 12 or 16 bytes long. */ -static int rfc4106_set_authsize(struct crypto_aead *parent, - unsigned int authsize) +static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, + unsigned int key_len) { struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); + struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); + struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; + int ret; + ret = crypto_aead_setkey(child, key, key_len); + if (!ret) { + memcpy(ctx, c_ctx, sizeof(*ctx)); + ctx->cryptd_tfm = cryptd_tfm; + } + return ret; +} + +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ switch (authsize) { case 8: case 12: @@ -965,51 +973,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, default: return -EINVAL; } - crypto_aead_crt(parent)->authsize = authsize; - crypto_aead_crt(cryptd_child)->authsize = authsize; + crypto_aead_crt(aead)->authsize = authsize; return 0; } -static int rfc4106_encrypt(struct aead_request *req) -{ - int ret; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_aead_encrypt(cryptd_req); - } else { - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - kernel_fpu_begin(); - ret = cryptd_child->base.crt_aead.encrypt(req); - kernel_fpu_end(); - return ret; - } -} - -static int rfc4106_decrypt(struct aead_request *req) +/* This is the Integrity Check Value (aka the authentication tag length and can + * be 8, 12 or 16 bytes long. */ +static int rfc4106_set_authsize(struct crypto_aead *parent, + unsigned int authsize) { + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); int ret; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_aead_decrypt(cryptd_req); - } else { - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - kernel_fpu_begin(); - ret = cryptd_child->base.crt_aead.decrypt(req); - kernel_fpu_end(); - return ret; - } + ret = crypto_aead_setauthsize(child, authsize); + if (!ret) + crypto_aead_crt(parent)->authsize = authsize; + return ret; } static int __driver_rfc4106_encrypt(struct aead_request *req) @@ -1185,6 +1165,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) } return retval; } + +static int rfc4106_encrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + ret = crypto_aead_encrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_encrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int rfc4106_decrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + ret = crypto_aead_decrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_decrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + int ret; + + if (unlikely(!irq_fpu_usable())) { + WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); + ret = -EINVAL; + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_encrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + int ret; + + if (unlikely(!irq_fpu_usable())) { + WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); + ret = -EINVAL; + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_decrypt(req); + kernel_fpu_end(); + } + return ret; +} #endif static struct crypto_alg aesni_algs[] = { { @@ -1366,8 +1418,12 @@ static struct crypto_alg aesni_algs[] = { { .cra_module = THIS_MODULE, .cra_u = { .aead = { - .encrypt = __driver_rfc4106_encrypt, - .decrypt = __driver_rfc4106_decrypt, + .setkey = common_rfc4106_set_key, + .setauthsize = common_rfc4106_set_authsize, + .encrypt = helper_rfc4106_encrypt, + .decrypt = helper_rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, }, }, }, { -- cgit v1.2.3 From 89f9f6796d41e10e224b0cb0027ddd78cb881f65 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 27 Feb 2015 11:25:59 -0800 Subject: hv: vmbus_post_msg: retry the hypercall on some transient errors I got HV_STATUS_INVALID_CONNECTION_ID on Hyper-V 2008 R2 when keeping running "rmmod hv_netvsc; modprobe hv_netvsc; rmmod hv_utils; modprobe hv_utils" in a Linux guest. Looks the host has some kind of throttling mechanism if some kinds of hypercalls are sent too frequently. Without the patch, the driver can occasionally fail to load. Also let's retry HV_STATUS_INSUFFICIENT_MEMORY, though we didn't get it before. Removed 'case -ENOMEM', since the hypervisor doesn't return this. CC: "K. Y. Srinivasan" Reviewed-by: Jason Wang Signed-off-by: Dexuan Cui Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/uapi/asm/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 90c458e66e13..ce6068dbcfbc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -225,6 +225,8 @@ #define HV_STATUS_INVALID_HYPERCALL_CODE 2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 #define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 typedef struct _HV_REFERENCE_TSC_PAGE { -- cgit v1.2.3 From d089f8e97d371a662dd233491e03bda377c9d46d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 5 Mar 2015 10:49:17 +1030 Subject: x86: fix up obsolete cpu function usage. Thanks to spatch, plus manual removal of "&*". Signed-off-by: Rusty Russell Cc: x86@kernel.org --- arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++---- arch/x86/kernel/irq.c | 4 ++-- arch/x86/platform/uv/tlb_uv.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e658f21681c8..d9d0bd2faaf4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -135,12 +135,12 @@ static void init_x2apic_ldr(void) per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); for_each_online_cpu(cpu) { if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); } } @@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void) BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); register_hotcpu_notifier(&x2apic_cpu_notifier); return 1; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 67b1cbe0093a..e5952c225532 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void) this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); - cpu_clear(this_cpu, online_new); + cpumask_clear_cpu(this_cpu, &online_new); this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void) data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); - cpu_clear(this_cpu, affinity_new); + cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 994798548b1a..3b6ec42718e4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) struct reset_args reset_args; reset_args.sender = sender; - cpus_clear(*mask); + cpumask_clear(mask); /* find a single cpu for each uvhub in this distribution mask */ maskbits = sizeof(struct pnmask) * BITSPERBYTE; /* each bit is a pnode relative to the partition base pnode */ @@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) continue; apnode = pnode + bcp->partition_base_pnode; cpu = pnode_to_first_cpu(apnode, smaster); - cpu_set(cpu, *mask); + cpumask_set_cpu(cpu, mask); } /* IPI all cpus; preemption is already disabled */ @@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - if (cpu_isset(cpu, *cpumask)) + if (cpumask_test_cpu(cpu, cpumask)) stat->s_ntargself++; bau_desc = bcp->descriptor_base; -- cgit v1.2.3 From d939be3add4f1410079dad2755d4936cdb70903b Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 27 Feb 2015 23:52:31 +0900 Subject: treewide: Fix typo in printk messages This patch fix spelling typo in printk messages. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- arch/x86/kernel/test_rodata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index b79133abda48..5ecbfe5099da 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -57,7 +57,7 @@ int rodata_test(void) /* test 3: check the value hasn't changed */ /* If this test fails, we managed to overwrite the data */ if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); + printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); return -ENODEV; } /* test 4: check if the rodata section is 4Kb aligned */ -- cgit v1.2.3 From 1bd187de536494a27925902b9653e9ef0ace4774 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 23 Feb 2015 16:24:44 +0200 Subject: x86, intel-mid: remove Intel MID specific serial support Since we have a native 8250 driver carrying the Intel MID serial devices the specific support is not needed anymore. This patch removes it for Intel MID. Note that the console device name is changed from ttyMFDx to ttySx. Signed-off-by: Andy Shevchenko Acked-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig.debug | 4 - arch/x86/include/asm/intel-mid.h | 3 - arch/x86/kernel/early_printk.c | 6 -- arch/x86/platform/intel-mid/Makefile | 1 - .../platform/intel-mid/early_printk_intel_mid.c | 112 --------------------- 5 files changed, 126 deletions(-) delete mode 100644 arch/x86/platform/intel-mid/early_printk_intel_mid.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 20028da8ae18..72484a645f05 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,10 +43,6 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config EARLY_PRINTK_INTEL_MID - bool "Early printk for Intel MID platform support" - depends on EARLY_PRINTK && X86_INTEL_MID - config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 705d35708a50..7c5af123bdbd 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_hsu_console; -extern void hsu_early_console_init(const char *); - extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index a62536a1be88..f85e3fb50f28 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -375,12 +375,6 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "hsu", 3)) { - hsu_early_console_init(buf + 3); - early_console_register(&early_hsu_console, keep); - } -#endif #ifdef CONFIG_EARLY_PRINTK_EFI if (!strncmp(buf, "efi", 3)) early_console_register(&early_efi_console, keep); diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index 0a8ee703b9fa..0ce1b1913673 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,5 +1,4 @@ obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o -obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c deleted file mode 100644 index 4e720829ab90..000000000000 --- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * early_printk_intel_mid.c - early consoles for Intel MID platforms - * - * Copyright (c) 2008-2010, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -/* - * This file implements early console named hsu. - * hsu is based on a High Speed UART device which only exists in the Medfield - * platform - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Following is the early console based on Medfield HSU (High - * Speed UART) device. - */ -#define HSU_PORT_BASE 0xffa28080 - -static void __iomem *phsu; - -void hsu_early_console_init(const char *s) -{ - unsigned long paddr, port = 0; - u8 lcr; - - /* - * Select the early HSU console port if specified by user in the - * kernel command line. - */ - if (*s && !kstrtoul(s, 10, &port)) - port = clamp_val(port, 0, 2); - - paddr = HSU_PORT_BASE + port * 0x80; - phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); - - /* Disable FIFO */ - writeb(0x0, phsu + UART_FCR); - - /* Set to default 115200 bps, 8n1 */ - lcr = readb(phsu + UART_LCR); - writeb((0x80 | lcr), phsu + UART_LCR); - writeb(0x18, phsu + UART_DLL); - writeb(lcr, phsu + UART_LCR); - writel(0x3600, phsu + UART_MUL*4); - - writeb(0x8, phsu + UART_MCR); - writeb(0x7, phsu + UART_FCR); - writeb(0x3, phsu + UART_LCR); - - /* Clear IRQ status */ - readb(phsu + UART_LSR); - readb(phsu + UART_RX); - readb(phsu + UART_IIR); - readb(phsu + UART_MSR); - - /* Enable FIFO */ - writeb(0x7, phsu + UART_FCR); -} - -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - -static void early_hsu_putc(char ch) -{ - unsigned int timeout = 10000; /* 10ms */ - u8 status; - - while (--timeout) { - status = readb(phsu + UART_LSR); - if (status & BOTH_EMPTY) - break; - udelay(1); - } - - /* Only write the char when there was no timeout */ - if (timeout) - writeb(ch, phsu + UART_TX); -} - -static void early_hsu_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_hsu_putc('\r'); - early_hsu_putc(*str); - str++; - } -} - -struct console early_hsu_console = { - .name = "earlyhsu", - .write = early_hsu_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -- cgit v1.2.3 From c467ea763fd5d8795b7d1b5a78eb94b6ad8f66ad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Mar 2015 18:06:33 +0100 Subject: context_tracking: Rename context symbols to prepare for transition state Current context tracking symbols are designed to express living state. As such they are prefixed with "IN_": IN_USER, IN_KERNEL. Now we are going to use these symbols to also express state transitions such as context_tracking_enter(IN_USER) or context_tracking_exit(IN_USER). But while the "IN_" prefix works well to express entering a context, it's confusing to depict a context exit: context_tracking_exit(IN_USER) could mean two things: 1) We are exiting the current context to enter user context. 2) We are exiting the user context We want 2) but the reviewer may be confused and understand 1) So lets disambiguate these symbols and rename them to CONTEXT_USER and CONTEXT_KERNEL. Acked-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..756f74eed35d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = IN_KERNEL; /* the value is irrelevant. */ + prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* -- cgit v1.2.3 From fdaf3a6539d6b9b6a7240c3b00bd81c813b2760d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 10 Mar 2015 12:43:45 +1030 Subject: x86: fix more deprecated cpu function usage. Signed-off-by: Rusty Russell --- arch/x86/kernel/apic/x2apic_cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index d9d0bd2faaf4..ab3219b3fbda 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -171,8 +171,8 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) for_each_online_cpu(cpu) { if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; - __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); - __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); } free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); free_cpumask_var(per_cpu(ipi_mask, this_cpu)); -- cgit v1.2.3 From 079119a2c7c83506ad753e7b95e9ed253e9963f9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Mar 2015 13:52:53 +0200 Subject: serial, x86: use UPF_* constants for flags This patch fixes the following sparse warnings: drivers/tty/serial/8250/8250_core.c:3231:32: warning: incorrect type in assignment (different base types) drivers/tty/serial/8250/8250_core.c:3231:32: expected restricted upf_t [usertype] flags drivers/tty/serial/8250/8250_core.c:3231:32: got unsigned int const [unsigned] flags Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/serial.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h index 460b84f64556..8378b8c9109c 100644 --- a/arch/x86/include/asm/serial.h +++ b/arch/x86/include/asm/serial.h @@ -12,11 +12,11 @@ /* Standard COM flags (except for COM4, because of the 8514 problem) */ #ifdef CONFIG_SERIAL_DETECT_IRQ -# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) -# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ) +# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) +# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ) #else -# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 ) -# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 ) +# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | 0 ) +# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | 0 ) #endif #define SERIAL_PORT_DFNS \ -- cgit v1.2.3 From 2a442c9c6453d3d043dfd89f2e03a1deff8a6f06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2015 11:42:15 -0800 Subject: x86: Use common outgoing-CPU-notification code This commit removes the open-coded CPU-offline notification with new common code. Among other things, this change avoids calling scheduler code using RCU from an offline CPU that RCU is ignoring. It also allows Xen to notice at online time that the CPU did not go offline correctly. Note that Xen has the surviving CPU carry out some cleanup operations, so if the surviving CPU times out, these cleanup operations might have been carried out while the outgoing CPU was still running. It might therefore be unwise to bring this CPU back online, and this commit avoids doing so. Signed-off-by: Boris Ostrovsky Signed-off-by: Paul E. McKenney Cc: Cc: Konrad Rzeszutek Wilk Cc: David Vrabel Cc: --- arch/x86/include/asm/cpu.h | 2 -- arch/x86/include/asm/smp.h | 2 +- arch/x86/kernel/smpboot.c | 39 ++++++++++++++++++--------------------- arch/x86/xen/smp.c | 46 +++++++++++++++++++++++++--------------------- 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index d2b12988d2ed..bf2caa1dedc5 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action); #endif #endif -DECLARE_PER_CPU(int, cpu_state); - int mwait_usable(const struct cpuinfo_x86 *); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..a5cb4f6e9492 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) } void cpu_disable_common(void); -void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); +int common_cpu_die(unsigned int cpu); void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..c8fa34963ead 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,9 +77,6 @@ #include #include -/* State of each CPU */ -DEFINE_PER_CPU(int, cpu_state) = { 0 }; - /* Number of siblings per CPU package */ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); @@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused) lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); /* enable local interrupts */ @@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) */ mtrr_save_state(); - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* x86 CPUs take themselves offline, so delayed offline is OK. */ + err = cpu_check_up_prepare(cpu); + if (err && err != -EBUSY) + return err; /* the FPU context is blank, nobody can own it */ __cpu_disable_lazy_restore(cpu); @@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void) switch_to_new_gdt(me); /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); - per_cpu(cpu_state, me) = CPU_ONLINE; + cpu_set_state_online(me); } void __init native_smp_cpus_done(unsigned int max_cpus) @@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -static DEFINE_PER_CPU(struct completion, die_complete); - void cpu_disable_common(void) { int cpu = smp_processor_id(); - init_completion(&per_cpu(die_complete, smp_processor_id())); - remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1349,24 +1345,27 @@ int native_cpu_disable(void) return 0; } -void cpu_die_common(unsigned int cpu) +int common_cpu_die(unsigned int cpu) { - wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); -} + int ret = 0; -void native_cpu_die(unsigned int cpu) -{ /* We don't do anything here: idle task is faking death itself. */ - cpu_die_common(cpu); - /* They ack this in play_dead() by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + if (cpu_wait_death(cpu, 5)) { if (system_state == SYSTEM_RUNNING) pr_info("CPU %u is now offline\n", cpu); } else { pr_err("CPU %u didn't die...\n", cpu); + ret = -1; } + + return ret; +} + +void native_cpu_die(unsigned int cpu) +{ + common_cpu_die(cpu); } void play_dead_common(void) @@ -1375,10 +1374,8 @@ void play_dead_common(void) reset_lazy_tlbstate(); amd_e400_remove_cpu(raw_smp_processor_id()); - mb(); /* Ack it */ - __this_cpu_write(cpu_state, CPU_DEAD); - complete(&per_cpu(die_complete, smp_processor_id())); + (void)cpu_report_death(); /* * With physical CPU hotplug, we should halt the cpu diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 08e8489c47f1..1c5e760f34ca 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -90,14 +90,10 @@ static void cpu_bringup(void) set_cpu_online(cpu, true); - this_cpu_write(cpu_state, CPU_ONLINE); - - wmb(); + cpu_set_state_online(cpu); /* Implies full memory barrier. */ /* We can take interrupts now: we're officially "up". */ local_irq_enable(); - - wmb(); /* make sure everything is out */ } /* @@ -459,7 +455,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) xen_setup_timer(cpu); xen_init_lock_cpu(cpu); - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* + * PV VCPUs are always successfully taken down (see 'while' loop + * in xen_cpu_die()), so -EBUSY is an error. + */ + rc = cpu_check_up_prepare(cpu); + if (rc) + return rc; /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -479,10 +481,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); - while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + while (cpu_report_state(cpu) != CPU_ONLINE) HYPERVISOR_sched_op(SCHEDOP_yield, NULL); - barrier(); - } return 0; } @@ -511,11 +511,11 @@ static void xen_cpu_die(unsigned int cpu) schedule_timeout(HZ/10); } - cpu_die_common(cpu); - - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); + if (common_cpu_die(cpu) == 0) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } } static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ @@ -747,6 +747,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; + + /* + * This can happen if CPU was offlined earlier and + * offlining timed out in common_cpu_die(). + */ + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + } + /* * xen_smp_intr_init() needs to run before native_cpu_up() * so that IPI vectors are set up on the booting CPU before @@ -768,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) return rc; } -static void xen_hvm_cpu_die(unsigned int cpu) -{ - xen_cpu_die(cpu); - native_cpu_die(cpu); -} - void __init xen_hvm_smp_init(void) { if (!xen_have_vector_callback) @@ -781,7 +785,7 @@ void __init xen_hvm_smp_init(void) smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.cpu_up = xen_hvm_cpu_up; - smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.cpu_die = xen_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; -- cgit v1.2.3 From 05713ba9055f1a209d50e480626f36c401bda3ad Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 11 Mar 2015 17:56:26 +0100 Subject: crypto: don't export static symbol The semantic patch that fixes this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ type T; identifier f; @@ static T f (...) { ... } @@ identifier r.f; declarer name EXPORT_SYMBOL_GPL; @@ -EXPORT_SYMBOL_GPL(f); // Signed-off-by: Julia Lawall Signed-off-by: Herbert Xu --- arch/x86/crypto/glue_helper.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 432f1d76ceb8..6a85598931b5 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, le128_to_be128((be128 *)walk->iv, &ctrblk); } -EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, -- cgit v1.2.3 From 297d716f6260cc9421d971b124ca196b957ee458 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 12 Mar 2015 08:44:11 +0100 Subject: power_supply: Change ownership from driver to core Change the ownership of power_supply structure from each driver implementing the class to the power supply core. The patch changes power_supply_register() function thus all drivers implementing power supply class are adjusted. Each driver provides the implementation of power supply. However it should not be the owner of power supply class instance because it is exposed by core to other subsystems with power_supply_get_by_name(). These other subsystems have no knowledge when the driver will unregister the power supply. This leads to several issues when driver is unbound - mostly because user of power supply accesses freed memory. Instead let the core own the instance of struct 'power_supply'. Other users of this power supply will still access valid memory because it will be freed when device reference count reaches 0. Currently this means "it will leak" but power_supply_put() call in next patches will solve it. This solves invalid memory references in following race condition scenario: Thread 1: charger manager Thread 2: power supply driver, used by charger manager THREAD 1 (charger manager) THREAD 2 (power supply driver) ========================== ============================== psy = power_supply_get_by_name() Driver unbind, .remove power_supply_unregister() Device fully removed psy->get_property() The 'get_property' call is executed in invalid context because the driver was unbound and struct 'power_supply' memory was freed. This could be observed easily with charger manager driver (here compiled with max17040 fuel gauge): $ cat /sys/devices/virtual/power_supply/cm-battery/capacity & $ echo "1-0036" > /sys/bus/i2c/drivers/max17040/unbind [ 55.725123] Unable to handle kernel NULL pointer dereference at virtual address 00000000 [ 55.732584] pgd = d98d4000 [ 55.734060] [00000000] *pgd=5afa2831, *pte=00000000, *ppte=00000000 [ 55.740318] Internal error: Oops: 80000007 [#1] PREEMPT SMP ARM [ 55.746210] Modules linked in: [ 55.749259] CPU: 1 PID: 2936 Comm: cat Tainted: G W 3.19.0-rc1-next-20141226-00048-gf79f475f3c44-dirty #1496 [ 55.760190] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree) [ 55.766270] task: d9b76f00 ti: daf54000 task.ti: daf54000 [ 55.771647] PC is at 0x0 [ 55.774182] LR is at charger_get_property+0x2f4/0x36c [ 55.779201] pc : [<00000000>] lr : [] psr: 60000013 [ 55.779201] sp : daf55e90 ip : 00000003 fp : 00000000 [ 55.790657] r10: 00000000 r9 : c06e2878 r8 : d9b26c68 [ 55.795865] r7 : dad81610 r6 : daec7410 r5 : daf55ebc r4 : 00000000 [ 55.802367] r3 : 00000000 r2 : daf55ebc r1 : 0000002a r0 : d9b26c68 [ 55.808879] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user [ 55.815994] Control: 10c5387d Table: 598d406a DAC: 00000015 [ 55.821723] Process cat (pid: 2936, stack limit = 0xdaf54210) [ 55.827451] Stack: (0xdaf55e90 to 0xdaf56000) [ 55.831795] 5e80: 60000013 c01459c4 0000002a c06f8ef8 [ 55.839956] 5ea0: db651000 c06f8ef8 daebac00 c04cb668 daebac08 c0346864 00000000 c01459c4 [ 55.848115] 5ec0: d99eaa80 c06f8ef8 00000fff 00001000 db651000 c027f25c c027f240 d99eaa80 [ 55.856274] 5ee0: d9a06c00 c0146218 daf55f18 00001000 d99eaa80 db4c18c0 00000001 00000001 [ 55.864468] 5f00: daf55f80 c0144c78 c0144c54 c0107f90 00015000 d99eaab0 00000000 00000000 [ 55.872603] 5f20: 000051c7 00000000 db4c18c0 c04a9370 00015000 00001000 daf55f80 00001000 [ 55.880763] 5f40: daf54000 00015000 00000000 c00e53dc db4c18c0 c00e548c 0000000d 00008124 [ 55.888937] 5f60: 00000001 00000000 00000000 db4c18c0 db4c18c0 00001000 00015000 c00e5550 [ 55.897099] 5f80: 00000000 00000000 00001000 00001000 00015000 00000003 00000003 c000f364 [ 55.905239] 5fa0: 00000000 c000f1a0 00001000 00015000 00000003 00015000 00001000 0001333c [ 55.913399] 5fc0: 00001000 00015000 00000003 00000003 00000002 00000000 00000000 00000000 [ 55.921560] 5fe0: 7fffe000 be999850 0000a225 b6f3c19c 60000010 00000003 00000000 00000000 [ 55.929744] [] (charger_get_property) from [] (power_supply_show_property+0x48/0x20c) [ 55.939286] [] (power_supply_show_property) from [] (dev_attr_show+0x1c/0x48) [ 55.948130] [] (dev_attr_show) from [] (sysfs_kf_seq_show+0x84/0x104) [ 55.956298] [] (sysfs_kf_seq_show) from [] (kernfs_seq_show+0x24/0x28) [ 55.964536] [] (kernfs_seq_show) from [] (seq_read+0x1b0/0x484) [ 55.972172] [] (seq_read) from [] (__vfs_read+0x18/0x4c) [ 55.979188] [] (__vfs_read) from [] (vfs_read+0x7c/0x100) [ 55.986304] [] (vfs_read) from [] (SyS_read+0x40/0x8c) [ 55.993164] [] (SyS_read) from [] (ret_fast_syscall+0x0/0x48) [ 56.000626] Code: bad PC value [ 56.011652] ---[ end trace 7b64343fbdae8ef1 ]--- Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bartlomiej Zolnierkiewicz [for the nvec part] Reviewed-by: Marc Dietrich [for compal-laptop.c] Acked-by: Darren Hart [for the mfd part] Acked-by: Lee Jones [for the hid part] Acked-by: Jiri Kosina [for the acpi part] Acked-by: Rafael J. Wysocki Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc-xo1-sci.c | 4 ++-- arch/x86/platform/olpc/olpc-xo15-sci.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 9a2e590dd202..e4ed28bbf79d 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -61,7 +61,7 @@ static void battery_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(psy->dev); + put_device(&psy->dev); } } @@ -71,7 +71,7 @@ static void ac_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(psy->dev); + put_device(&psy->dev); } } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 08e350e757dc..186634e9021d 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -83,7 +83,7 @@ static void battery_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(psy->dev); + put_device(&psy->dev); } } @@ -93,7 +93,7 @@ static void ac_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(psy->dev); + put_device(&psy->dev); } } -- cgit v1.2.3 From ed6dad52298152a5c493223234e431f206c5a46b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 12 Mar 2015 08:44:15 +0100 Subject: x86/olpc/xo1/sci: Use newly added power_supply_put API Replace direct usage of put_device() with new API: power_supply_put(). Signed-off-by: Krzysztof Kozlowski Acked-by: Pavel Machek Reviewed-by: Bartlomiej Zolnierkiewicz Reviewed-by: Sebastian Reichel Acked-by: Ingo Molnar Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc-xo1-sci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index e4ed28bbf79d..7fa8b3b53bc0 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -61,7 +61,7 @@ static void battery_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(&psy->dev); + power_supply_put(psy); } } @@ -71,7 +71,7 @@ static void ac_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(&psy->dev); + power_supply_put(psy); } } -- cgit v1.2.3 From 67273a1b421a12ef77bcf08b027d165f399054ae Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 12 Mar 2015 08:44:16 +0100 Subject: x86/olpc/xo15/sci: Use newly added power_supply_put API Replace direct usage of put_device() with new API: power_supply_put(). Signed-off-by: Krzysztof Kozlowski Acked-by: Pavel Machek Reviewed-by: Bartlomiej Zolnierkiewicz Reviewed-by: Sebastian Reichel Acked-by: Ingo Molnar Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc-xo15-sci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 186634e9021d..55130846ac87 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -83,7 +83,7 @@ static void battery_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(&psy->dev); + power_supply_put(psy); } } @@ -93,7 +93,7 @@ static void ac_status_changed(void) if (psy) { power_supply_changed(psy); - put_device(&psy->dev); + power_supply_put(psy); } } -- cgit v1.2.3 From c42e9902f3f1e731a7b83397acdf402eeb3237ca Mon Sep 17 00:00:00 2001 From: Ameen Ali Date: Fri, 13 Mar 2015 23:38:21 +0200 Subject: crypto: sha1-mb - Syntax error fixing a syntax-error . Signed-off-by: Ameen Ali Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index fd9f6b035b16..9414fd964ecd 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -828,7 +828,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) while (!list_empty(&cstate->work_list)) { rctx = list_entry(cstate->work_list.next, struct mcryptd_hash_request_ctx, waiter); - if time_before(cur_time, rctx->tag.expire) + if (time_before(cur_time, rctx->tag.expire)) break; kernel_fpu_begin(); sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); -- cgit v1.2.3 From 9b4ade226f7468bb26f98b6cd01cb5b8a05fc96d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 21 Jan 2015 08:49:22 +0100 Subject: xen: build infrastructure for generating hypercall depending symbols Today there are several places in the kernel which build tables containing one entry for each possible Xen hypercall. Create an infrastructure to be able to generate these tables at build time. Based-on-patch-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Ingo Molnar Signed-off-by: David Vrabel --- arch/x86/syscalls/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 3323c2745248..a55abb9f6c5e 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ +quiet_cmd_hypercalls = HYPERCALLS $@ + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + syshdr_abi_unistd_32 := i386 $(uapi)/unistd_32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) @@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl) $(out)/syscalls_64.h: $(syscall64) $(systbl) $(call if_changed,systbl) +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh + $(call if_changed,hypercalls) + +$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h + uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_XEN) += xen-hypercalls.h targets += $(uapisyshdr-y) $(syshdr-y) -- cgit v1.2.3 From 16b12d605795e775cd80b5901c85dd2e320a6ec2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 21 Jan 2015 08:49:23 +0100 Subject: xen: synchronize include/xen/interface/xen.h with xen The header include/xen/interface/xen.h doesn't contain all definitions from Xen's version of that header. Update it accordingly. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 520022d1a181..8296cdf4e581 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -29,7 +29,7 @@ static const char *xen_hypercall_names[] = { N(vcpu_op), N(set_segment_base), N(mmuext_op), - N(acm_op), + N(xsm_op), N(nmi_op), N(sched_op), N(callback_op), -- cgit v1.2.3 From fc903f87364d1b00890fa991a6f116e2bb38ec1e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 21 Jan 2015 08:49:24 +0100 Subject: xen: use generated hypervisor symbols in arch/x86/xen/trace.c Instead of manually list all hypervisor calls in arch/x86/xen/trace.c use the auto generated list. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/trace.c | 50 ++++---------------------------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 8296cdf4e581..a702ec2f5931 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -1,54 +1,12 @@ #include #include +#include -#define N(x) [__HYPERVISOR_##x] = "("#x")" +#define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")", static const char *xen_hypercall_names[] = { - N(set_trap_table), - N(mmu_update), - N(set_gdt), - N(stack_switch), - N(set_callbacks), - N(fpu_taskswitch), - N(sched_op_compat), - N(dom0_op), - N(set_debugreg), - N(get_debugreg), - N(update_descriptor), - N(memory_op), - N(multicall), - N(update_va_mapping), - N(set_timer_op), - N(event_channel_op_compat), - N(xen_version), - N(console_io), - N(physdev_op_compat), - N(grant_table_op), - N(vm_assist), - N(update_va_mapping_otherdomain), - N(iret), - N(vcpu_op), - N(set_segment_base), - N(mmuext_op), - N(xsm_op), - N(nmi_op), - N(sched_op), - N(callback_op), - N(xenoprof_op), - N(event_channel_op), - N(physdev_op), - N(hvm_op), - -/* Architecture-specific hypercall definitions. */ - N(arch_0), - N(arch_1), - N(arch_2), - N(arch_3), - N(arch_4), - N(arch_5), - N(arch_6), - N(arch_7), +#include }; -#undef N +#undef HYPERCALL static const char *xen_hypercall_name(unsigned op) { -- cgit v1.2.3 From 526abeaed4971def462f209632b88fd7b8c4a09c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 21 Jan 2015 08:49:25 +0100 Subject: xen: use generated hypercall symbols in arch/x86/xen/xen-head.S Instead of manually list each hypercall in arch/x86/xen/xen-head.S use the auto generated symbol list. This also corrects the wrong address of xen_hypercall_mca which was located 32 bytes higher than it should. Symbol addresses have been verified to match the correct ones via objdump output. Based-on-patch-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/xen-head.S | 63 ++++++++----------------------------------------- 1 file changed, 10 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 674b222544b7..8afdfccf6086 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -12,6 +12,8 @@ #include #include +#include +#include #include #ifdef CONFIG_XEN_PVH @@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init) .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) -#define NEXT_HYPERCALL(x) \ - ENTRY(xen_hypercall_##x) \ - .skip 32 - -NEXT_HYPERCALL(set_trap_table) -NEXT_HYPERCALL(mmu_update) -NEXT_HYPERCALL(set_gdt) -NEXT_HYPERCALL(stack_switch) -NEXT_HYPERCALL(set_callbacks) -NEXT_HYPERCALL(fpu_taskswitch) -NEXT_HYPERCALL(sched_op_compat) -NEXT_HYPERCALL(platform_op) -NEXT_HYPERCALL(set_debugreg) -NEXT_HYPERCALL(get_debugreg) -NEXT_HYPERCALL(update_descriptor) -NEXT_HYPERCALL(ni) -NEXT_HYPERCALL(memory_op) -NEXT_HYPERCALL(multicall) -NEXT_HYPERCALL(update_va_mapping) -NEXT_HYPERCALL(set_timer_op) -NEXT_HYPERCALL(event_channel_op_compat) -NEXT_HYPERCALL(xen_version) -NEXT_HYPERCALL(console_io) -NEXT_HYPERCALL(physdev_op_compat) -NEXT_HYPERCALL(grant_table_op) -NEXT_HYPERCALL(vm_assist) -NEXT_HYPERCALL(update_va_mapping_otherdomain) -NEXT_HYPERCALL(iret) -NEXT_HYPERCALL(vcpu_op) -NEXT_HYPERCALL(set_segment_base) -NEXT_HYPERCALL(mmuext_op) -NEXT_HYPERCALL(xsm_op) -NEXT_HYPERCALL(nmi_op) -NEXT_HYPERCALL(sched_op) -NEXT_HYPERCALL(callback_op) -NEXT_HYPERCALL(xenoprof_op) -NEXT_HYPERCALL(event_channel_op) -NEXT_HYPERCALL(physdev_op) -NEXT_HYPERCALL(hvm_op) -NEXT_HYPERCALL(sysctl) -NEXT_HYPERCALL(domctl) -NEXT_HYPERCALL(kexec_op) -NEXT_HYPERCALL(tmem_op) /* 38 */ -ENTRY(xen_hypercall_rsvr) - .skip 320 -NEXT_HYPERCALL(mca) /* 48 */ -NEXT_HYPERCALL(arch_1) -NEXT_HYPERCALL(arch_2) -NEXT_HYPERCALL(arch_3) -NEXT_HYPERCALL(arch_4) -NEXT_HYPERCALL(arch_5) -NEXT_HYPERCALL(arch_6) - .balign PAGE_SIZE + .skip PAGE_SIZE + +#define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 +#include +#undef HYPERCALL + .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") -- cgit v1.2.3 From feb44f1f7a4ac299d1ab1c3606860e70b9b89d69 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 2 Mar 2015 12:06:23 -0500 Subject: x86/xen: Provide a "Xen PV" APIC driver to support >255 VCPUs Instead of mangling the default APIC driver, provide a Xen PV guest specific one that explicitly provides appropriate methods. This allows use to report that all APIC IDs are valid, allowing dom0 to boot with more than 255 VCPUs. Since the probe order of APIC drivers is link dependent, we add in an late probe function to change to the Xen PV if it hadn't been done during bootup. Suggested-by: David Vrabel Reported-by: Cathy Avery Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/apic.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/enlighten.c | 90 +----------------------- 2 files changed, 181 insertions(+), 89 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 7005ced5d1ad..5e0ecaf4c336 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include #include #include "xen-ops.h" +#include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { @@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } +static unsigned long xen_set_apic_id(unsigned int x) +{ + WARN_ON(1); + return x; +} + +static unsigned int xen_get_apic_id(unsigned long x) +{ + return ((x)>>24) & 0xFFu; +} + +static u32 xen_apic_read(u32 reg) +{ + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = 0, + }; + int ret = 0; + + /* Shouldn't need this as APIC is turned off for PV, and we only + * get called on the bootup processor. But just in case. */ + if (!xen_initial_domain() || smp_processor_id()) + return 0; + + if (reg == APIC_LVR) + return 0x10; +#ifdef CONFIG_X86_32 + if (reg == APIC_LDR) + return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); +#endif + if (reg != APIC_ID) + return 0; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return 0; + + return op.u.pcpu_info.apic_id << 24; +} + +static void xen_apic_write(u32 reg, u32 val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static int xen_apic_probe_pv(void) +{ + if (xen_pv_domain()) + return 1; + + return 0; +} + +static int xen_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return xen_pv_domain(); +} + +static int xen_id_always_valid(int apicid) +{ + return 1; +} + +static int xen_id_always_registered(void) +{ + return 1; +} + +static int xen_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +#ifdef CONFIG_X86_32 +static int xen_x86_32_early_logical_apicid(int cpu) +{ + /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */ + return 1 << cpu; +} +#endif + +static void xen_noop(void) +{ +} + +static void xen_silent_inquire(int apicid) +{ +} + +static struct apic xen_pv_apic = { + .name = "Xen PV", + .probe = xen_apic_probe_pv, + .acpi_madt_oem_check = xen_madt_oem_check, + .apic_id_valid = xen_id_always_valid, + .apic_id_registered = xen_id_always_registered, + + /* .irq_delivery_mode - used in native_compose_msi_msg only */ + /* .irq_dest_mode - used in native_compose_msi_msg only */ + + .target_cpus = default_target_cpus, + .disable_esr = 0, + /* .dest_logical - default_send_IPI_ use it but we use our own. */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ + + .vector_allocation_domain = flat_vector_allocation_domain, + .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ + + .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ + .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ + .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ + + .get_apic_id = xen_get_apic_id, + .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ + .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */ + + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + +#ifdef CONFIG_SMP + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, +#endif + /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */ + .inquire_remote_apic = xen_silent_inquire, + + .read = xen_apic_read, + .write = xen_apic_write, + .eoi_write = xen_apic_write, + + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_noop, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + /* generic_processor_info and setup_local_APIC. */ + .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid, +#endif +}; + +static void __init xen_apic_check(void) +{ + if (apic == &xen_pv_apic) + return; + + pr_info("Switched APIC routing from %s to %s.\n", apic->name, + xen_pv_apic.name); + apic = &xen_pv_apic; +} void __init xen_init_apic(void) { x86_io_apic_ops.read = xen_io_apic_read; + /* On PV guests the APIC CPUID bit is disabled so none of the + * routines end up executing. */ + if (!xen_initial_domain()) + apic = &xen_pv_apic; + + x86_platform.apic_post_init = xen_apic_check; } +apic_driver(xen_pv_apic); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..b9a227284149 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -927,92 +927,6 @@ static void xen_io_delay(void) { } -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long xen_set_apic_id(unsigned int x) -{ - WARN_ON(1); - return x; -} -static unsigned int xen_get_apic_id(unsigned long x) -{ - return ((x)>>24) & 0xFFu; -} -static u32 xen_apic_read(u32 reg) -{ - struct xen_platform_op op = { - .cmd = XENPF_get_cpuinfo, - .interface_version = XENPF_INTERFACE_VERSION, - .u.pcpu_info.xen_cpuid = 0, - }; - int ret = 0; - - /* Shouldn't need this as APIC is turned off for PV, and we only - * get called on the bootup processor. But just in case. */ - if (!xen_initial_domain() || smp_processor_id()) - return 0; - - if (reg == APIC_LVR) - return 0x10; - - if (reg != APIC_ID) - return 0; - - ret = HYPERVISOR_dom0_op(&op); - if (ret) - return 0; - - return op.u.pcpu_info.apic_id << 24; -} - -static void xen_apic_write(u32 reg, u32 val) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static u64 xen_apic_icr_read(void) -{ - return 0; -} - -static void xen_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void xen_apic_wait_icr_idle(void) -{ - return; -} - -static u32 xen_safe_apic_wait_icr_idle(void) -{ - return 0; -} - -static void set_xen_basic_apic_ops(void) -{ - apic->read = xen_apic_read; - apic->write = xen_apic_write; - apic->icr_read = xen_apic_icr_read; - apic->icr_write = xen_apic_icr_write; - apic->wait_icr_idle = xen_apic_wait_icr_idle; - apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; - apic->set_apic_id = xen_set_apic_id; - apic->get_apic_id = xen_get_apic_id; - -#ifdef CONFIG_SMP - apic->send_IPI_allbutself = xen_send_IPI_allbutself; - apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; - apic->send_IPI_mask = xen_send_IPI_mask; - apic->send_IPI_all = xen_send_IPI_all; - apic->send_IPI_self = xen_send_IPI_self; -#endif -} - -#endif - static void xen_clts(void) { struct multicall_space mcs; @@ -1618,7 +1532,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* * set up the basic apic ops. */ - set_xen_basic_apic_ops(); + xen_init_apic(); #endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { @@ -1731,8 +1645,6 @@ asmlinkage __visible void __init xen_start_kernel(void) if (HYPERVISOR_dom0_op(&op) == 0) boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; - xen_init_apic(); - /* Make sure ACS will be enabled */ pci_request_acs(); -- cgit v1.2.3 From b3b06c7eb7820cea5c15f9faa4964044284c5399 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 3 Mar 2015 16:12:12 -0500 Subject: x86/xen/apic: WARN with details. We should not be writting to the APIC registers under Xen PV. But if we do instead of just giving an blanket warning - include some details to help troubleshoot. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 5e0ecaf4c336..70e060ad879a 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -73,7 +73,7 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ - WARN_ON(1); + WARN(1,"register: %x, value: %x\n", reg, val); } static u64 xen_apic_icr_read(void) -- cgit v1.2.3 From 628c28eefd6f2cef03b212081b466ae43fd093a3 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 11 Mar 2015 14:49:56 +0000 Subject: xen: unify foreign GFN map/unmap for auto-xlated physmap guests Auto-translated physmap guests (arm, arm64 and x86 PVHVM/PVH) map and unmap foreign GFNs using the same method (updating the physmap). Unify the two arm and x86 implementations into one commont one. Note that on arm and arm64, the correct error code will be returned (instead of always -EFAULT) and map/unmap failure warnings are no longer printed. These changes are required if the foreign domain is paging (-ENOENT failures are expected and must be propagated up to the caller). Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 110 ++--------------------------------------------------- 1 file changed, 3 insertions(+), 107 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..3d536a56ddf1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2436,95 +2436,6 @@ void __init xen_hvm_init_mmu_ops(void) } #endif -#ifdef CONFIG_XEN_PVH -/* - * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user - * space creating new guest on pvh dom0 and needing to map domU pages. - */ -static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, - unsigned int domid) -{ - int rc, err = 0; - xen_pfn_t gpfn = lpfn; - xen_ulong_t idx = fgfn; - - struct xen_add_to_physmap_range xatp = { - .domid = DOMID_SELF, - .foreign_domid = domid, - .size = 1, - .space = XENMAPSPACE_gmfn_foreign, - }; - set_xen_guest_handle(xatp.idxs, &idx); - set_xen_guest_handle(xatp.gpfns, &gpfn); - set_xen_guest_handle(xatp.errs, &err); - - rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); - if (rc < 0) - return rc; - return err; -} - -static int xlate_remove_from_p2m(unsigned long spfn, int count) -{ - struct xen_remove_from_physmap xrp; - int i, rc; - - for (i = 0; i < count; i++) { - xrp.domid = DOMID_SELF; - xrp.gpfn = spfn+i; - rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); - if (rc) - break; - } - return rc; -} - -struct xlate_remap_data { - unsigned long fgfn; /* foreign domain's gfn */ - pgprot_t prot; - domid_t domid; - int index; - struct page **pages; -}; - -static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) -{ - int rc; - struct xlate_remap_data *remap = data; - unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); - pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); - - rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); - if (rc) - return rc; - native_set_pte(ptep, pteval); - - return 0; -} - -static int xlate_remap_gfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long mfn, - int nr, pgprot_t prot, unsigned domid, - struct page **pages) -{ - int err; - struct xlate_remap_data pvhdata; - - BUG_ON(!pages); - - pvhdata.fgfn = mfn; - pvhdata.prot = prot; - pvhdata.domid = domid; - pvhdata.index = 0; - pvhdata.pages = pages; - err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, - xlate_map_pte_fn, &pvhdata); - flush_tlb_all(); - return err; -} -#endif - #define REMAP_BATCH_SIZE 16 struct remap_data { @@ -2564,8 +2475,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, - domid, pages); + return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot, + domid, pages); #else return -EINVAL; #endif @@ -2609,22 +2520,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, return 0; #ifdef CONFIG_XEN_PVH - while (numpgs--) { - /* - * The mmu has already cleaned up the process mmu - * resources at this point (lookup_address will return - * NULL). - */ - unsigned long pfn = page_to_pfn(pages[numpgs]); - - xlate_remove_from_p2m(pfn, 1); - } - /* - * We don't need to flush tlbs because as part of - * xlate_remove_from_p2m, the hypervisor will do tlb flushes - * after removing the p2m entries from the EPT/NPT - */ - return 0; + return xen_xlate_unmap_gfn_range(vma, numpgs, pages); #else return -EINVAL; #endif -- cgit v1.2.3 From 4e8c0c8c4bf3a5b5c98046e146ab3884bf7a7d0e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 11 Mar 2015 14:49:57 +0000 Subject: xen/privcmd: improve performance of MMAPBATCH_V2 Make the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version) map multiple frames at a time rather than one at a time, despite the pages being non-consecutive GFNs. xen_remap_foreign_mfn_array() is added which maps an array of GFNs (instead of a consecutive range of GFNs). Since per-frame errors are returned in an array, privcmd must set the MMAPBATCH_V1 error bits as part of the "report errors" phase, after all the frames are mapped. Migrate times are significantly improved (when using a PV toolstack domain). For example, for an idle 12 GiB PV guest: Before After real 0m38.179s 0m26.868s user 0m15.096s 0m13.652s sys 0m28.988s 0m18.732s Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 101 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3d536a56ddf1..29b3be230ede 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2439,7 +2439,8 @@ void __init xen_hvm_init_mmu_ops(void) #define REMAP_BATCH_SIZE 16 struct remap_data { - unsigned long mfn; + xen_pfn_t *mfn; + bool contiguous; pgprot_t prot; struct mmu_update *mmu_update; }; @@ -2448,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); + pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot)); + + /* If we have a contigious range, just update the mfn itself, + else update pointer to be "next mfn". */ + if (rmd->contiguous) + (*rmd->mfn)++; + else + rmd->mfn++; rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); @@ -2457,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, return 0; } -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, - unsigned long addr, - xen_pfn_t mfn, int nr, - pgprot_t prot, unsigned domid, - struct page **pages) - +static int do_remap_mfn(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t *mfn, int nr, + int *err_ptr, pgprot_t prot, + unsigned domid, + struct page **pages) { + int err = 0; struct remap_data rmd; struct mmu_update mmu_update[REMAP_BATCH_SIZE]; - int batch; unsigned long range; - int err = 0; + int mapped = 0; BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot, - domid, pages); + return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + prot, domid, pages); #else return -EINVAL; #endif @@ -2484,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, rmd.mfn = mfn; rmd.prot = prot; + /* We use the err_ptr to indicate if there we are doing a contigious + * mapping or a discontigious mapping. */ + rmd.contiguous = !err_ptr; while (nr) { - batch = min(REMAP_BATCH_SIZE, nr); + int index = 0; + int done = 0; + int batch = min(REMAP_BATCH_SIZE, nr); + int batch_left = batch; range = (unsigned long)batch << PAGE_SHIFT; rmd.mmu_update = mmu_update; @@ -2495,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); - if (err < 0) - goto out; + /* We record the error for each page that gives an error, but + * continue mapping until the whole set is done */ + do { + int i; + + err = HYPERVISOR_mmu_update(&mmu_update[index], + batch_left, &done, domid); + + /* + * @err_ptr may be the same buffer as @mfn, so + * only clear it after each chunk of @mfn is + * used. + */ + if (err_ptr) { + for (i = index; i < index + done; i++) + err_ptr[i] = 0; + } + if (err < 0) { + if (!err_ptr) + goto out; + err_ptr[i] = err; + done++; /* Skip failed frame. */ + } else + mapped += done; + batch_left -= done; + index += done; + } while (batch_left); nr -= batch; addr += range; + if (err_ptr) + err_ptr += batch; } - - err = 0; out: xen_flush_tlb_all(); - return err; + return err < 0 ? err : mapped; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t mfn, int nr, + pgprot_t prot, unsigned domid, + struct page **pages) +{ + return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +int xen_remap_domain_mfn_array(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t *mfn, int nr, + int *err_ptr, pgprot_t prot, + unsigned domid, struct page **pages) +{ + /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, + * and the consequences later is quite hard to detect what the actual + * cause of "wrong memory was mapped in". + */ + BUG_ON(err_ptr == NULL); + return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); + + /* Returns: 0 success */ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) -- cgit v1.2.3 From c80e5c0c23ce2282476fdc64c4b5e3d3a40723fd Mon Sep 17 00:00:00 2001 From: Eugene Shatokhin Date: Tue, 17 Mar 2015 19:09:18 +0900 Subject: kprobes/x86: Return correct length in __copy_instruction() On x86-64, __copy_instruction() always returns 0 (error) if the instruction uses %rip-relative addressing. This is because kernel_insn_init() is called the second time for 'insn' instance in such cases and sets all its fields to 0. Because of this, trying to place a kprobe on such instruction will fail, register_kprobe() will return -EINVAL. This patch fixes the problem. Signed-off-by: Eugene Shatokhin Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20150317100918.28349.94654.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4e3d5a9621fe..03189d86357d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); @@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src) return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); + length = insn.length; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, insn.length); + memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, insn.length); + kernel_insn_init(&insn, dest, length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src) *(s32 *) disp = (s32) newdisp; } #endif - return insn.length; + return length; } static int arch_copy_kprobe(struct kprobe *p) -- cgit v1.2.3 From 431d452af13720463dda498999b2e9a08729c03a Mon Sep 17 00:00:00 2001 From: Zhonghui Fu Date: Wed, 18 Mar 2015 15:54:27 +0100 Subject: PM / sleep: add pm-trace support for suspending phase Occasionally, the system can't come back up after suspend/resume due to problems of device suspending phase. This patch make PM_TRACE infrastructure cover device suspending phase of suspend/resume process, and the information in RTC can tell developers which device suspending function make system hang. Signed-off-by: Zhonghui Fu Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/pm-trace.h | 23 +++++++++++++++++++++++ arch/x86/include/asm/resume-trace.h | 21 --------------------- 2 files changed, 23 insertions(+), 21 deletions(-) create mode 100644 arch/x86/include/asm/pm-trace.h delete mode 100644 arch/x86/include/asm/resume-trace.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h new file mode 100644 index 000000000000..7b7ac42c3661 --- /dev/null +++ b/arch/x86/include/asm/pm-trace.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_PM_TRACE_H +#define _ASM_X86_PM_TRACE_H + +#include + +#define TRACE_RESUME(user) \ +do { \ + if (pm_trace_enabled) { \ + const void *tracedata; \ + asm volatile(_ASM_MOV " $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n\t" \ + _ASM_PTR " %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_pm_trace(tracedata, user); \ + } \ +} while (0) + +#define TRACE_SUSPEND(user) TRACE_RESUME(user) + +#endif /* _ASM_X86_PM_TRACE_H */ diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h deleted file mode 100644 index 3ff1c2cb1da5..000000000000 --- a/arch/x86/include/asm/resume-trace.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _ASM_X86_RESUME_TRACE_H -#define _ASM_X86_RESUME_TRACE_H - -#include - -#define TRACE_RESUME(user) \ -do { \ - if (pm_trace_enabled) { \ - const void *tracedata; \ - asm volatile(_ASM_MOV " $1f,%0\n" \ - ".section .tracedata,\"a\"\n" \ - "1:\t.word %c1\n\t" \ - _ASM_PTR " %c2\n" \ - ".previous" \ - :"=r" (tracedata) \ - : "i" (__LINE__), "i" (__FILE__)); \ - generate_resume_trace(tracedata, user); \ - } \ -} while (0) - -#endif /* _ASM_X86_RESUME_TRACE_H */ -- cgit v1.2.3 From b97ea289cf6aff8d4cbcefe2b707bb9b00a73c73 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Mon, 16 Mar 2015 11:18:56 +0800 Subject: PCI: Assign resources before drivers claim devices (pci_scan_root_bus()) Previously, pci_scan_root_bus() created a root PCI bus, enumerated the devices on it, and called pci_bus_add_devices(), which made the devices available for drivers to claim them. Most callers assigned resources to devices after pci_scan_root_bus() returns, which may be after drivers have claimed the devices. This is incorrect; the PCI core should not change device resources while a driver is managing the device. Remove pci_bus_add_devices() from pci_scan_root_bus() and do it after any resource assignment in the callers. Note that ARM's pci_common_init_dev() already called pci_bus_add_devices() after pci_scan_root_bus(), so we only need to remove the first call: pci_common_init_dev pcibios_init_hw pci_scan_root_bus pci_bus_add_devices # first call pci_bus_assign_resources pci_bus_add_devices # second call [bhelgaas: changelog, drop "root_bus" var in alpha common_init_pci(), return failure earlier in mn10300, add "return" in x86 pcibios_scan_root(), return early if xtensa platform_pcibios_fixup() fails] Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas CC: Richard Henderson CC: Ivan Kokshaysky CC: Matt Turner CC: David Howells CC: Tony Luck CC: Michal Simek CC: Ralf Baechle CC: Koichi Yasutake CC: Sebastian Ott CC: "David S. Miller" CC: Chris Metcalf CC: Chris Zankel CC: Max Filippov CC: Thomas Gleixner --- arch/x86/pci/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3d2612b68694..95a0ba70376b 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum) if (!bus) { pci_free_resource_list(&resources); kfree(sd); + return; } + pci_bus_add_devices(bus); } void __init pcibios_set_cache_line_size(void) -- cgit v1.2.3 From 88ad1a147e2c84d33cb50f5ebff1ece5e0cd4383 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Mar 2015 14:18:12 +1030 Subject: lguest: fix pending interrupt test. Denys says: TEST with zero will always set ZF. Thus, "jnz send_interrupts" never jumps. We get interrupts regularly enough that this didn't cause immediate problems. Reported-by: Denys Vlasenko Signed-off-by: Rusty Russell --- arch/x86/lguest/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S index 6ddfe4fc23c3..05b0a85507ce 100644 --- a/arch/x86/lguest/head_32.S +++ b/arch/x86/lguest/head_32.S @@ -84,7 +84,7 @@ ENTRY(lg_irq_enable) * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we * jump to send_interrupts, otherwise we're done. */ - testl $0, lguest_data+LGUEST_DATA_irq_pending + cmpl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts /* * One cool thing about x86 is that you can do many things without using -- cgit v1.2.3 From 4e16ed99416ef569a89782a7234f95007919fadd Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 26 Feb 2015 18:47:00 +0000 Subject: perf/x86/intel: Fix Makefile to actually build the cqm driver Someone fat fingered a merge conflict and lost the Makefile hunk. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1424976420.15321.35.camel@mfleming-mobl1.ger.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 80091ae54c2b..6c1ca139f736 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ -- cgit v1.2.3 From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2015 22:10:19 +0100 Subject: perf: Remove type specific target pointers The only reason CQM had to use a hard-coded pmu type was so it could use cqm_target in hw_perf_event. Do away with the {tp,bp,cqm}_target pointers and provide a non type specific one. This allows us to do away with that silly pmu type as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: acme@kernel.org Cc: acme@redhat.com Cc: hpa@zytor.com Cc: jolsa@redhat.com Cc: kanaka.d.juvva@intel.com Cc: matt.fleming@intel.com Cc: tglx@linutronix.de Cc: torvalds@linux-foundation.org Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 9a8ef8376fcd..e4d1b8b738fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. */ - if (a->hw.cqm_target == b->hw.cqm_target) + if (a->hw.target == b->hw.target) return true; /* @@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.cqm_target); + return perf_cgroup_from_task(event->hw.target); return event->cgrp; } @@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", - PERF_TYPE_INTEL_CQM); + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else -- cgit v1.2.3 From 41f055d49cb04d648a89a2cb6d57c94ff86d5bb6 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 24 Mar 2015 11:51:38 +1030 Subject: lguest: rename i386_head.S in the comments i386_head.S renamed to the head_32.S, let's update it in the comments too. Signed-off-by: Alexander Kuleshov Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..543510a2f9e0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -262,7 +262,7 @@ PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); /*:*/ -/* These are in i386_head.S */ +/* These are in head_32.S */ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); @@ -1366,7 +1366,7 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. + * and these are in head_32.S. */ /*G:060 We construct a table from the assembler templates: */ -- cgit v1.2.3 From 7042cb4eb30967b5eb9eeba04907882f04d6b6e5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 24 Mar 2015 11:51:39 +1030 Subject: lguest: simplify lguest_iret Signed-off-by: Denys Vlasenko CC: lguest@lists.ozlabs.org CC: x86@kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Rusty Russell --- arch/x86/lguest/head_32.S | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S index 05b0a85507ce..81678bf0fcb7 100644 --- a/arch/x86/lguest/head_32.S +++ b/arch/x86/lguest/head_32.S @@ -168,29 +168,28 @@ ENTRY(lg_restore_fl) * So we have to copy eflags from the stack to lguest_data.irq_enabled before * we do the "iret". * - * There are two problems with this: firstly, we need to use a register to do - * the copy and secondly, the whole thing needs to be atomic. The first - * problem is easy to solve: push %eax on the stack so we can use it, and then - * restore it at the end just before the real "iret". + * There are two problems with this: firstly, we can't clobber any registers + * and secondly, the whole thing needs to be atomic. The first problem + * is solved by using "push memory"/"pop memory" instruction pair for copying. * * The second is harder: copying eflags to lguest_data.irq_enabled will turn * interrupts on before we're finished, so we could be interrupted before we * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. + * enabled. (It's not necessary to protect pop instruction, since + * data gets updated only after it completes, so we end up surrounding + * just one instruction, iret). */ ENTRY(lguest_iret) - pushl %eax - movl 12(%esp), %eax -lguest_noirq_start: + pushl 2*4(%esp) /* * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: * prefix makes sure we use the stack segment, which is still valid. */ - movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled - popl %eax + popl %ss:lguest_data+LGUEST_DATA_irq_enabled +lguest_noirq_start: iret lguest_noirq_end: -- cgit v1.2.3 From 2f921b5bb0511fb698681d8ef35c48be7a9116bf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 24 Mar 2015 11:51:39 +1030 Subject: lguest: suppress interrupts for single insn, not range. The last patch reduced our interrupt-suppression region to one address, so simplify the code somewhat. Also, remove the obsolete undefined instruction ranges and the comment which refers to lguest_guest.S instead of head_32.S. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest.h | 7 ++----- arch/x86/lguest/boot.c | 3 +-- arch/x86/lguest/head_32.S | 15 ++++++--------- 3 files changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index e2d4a4afa8c3..3bbc07a57a31 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -20,13 +20,10 @@ extern unsigned long switcher_addr; /* Found in switcher.S */ extern unsigned long default_idt_entries[]; -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; +/* Declarations for definitions in arch/x86/lguest/head_32.S */ +extern char lguest_noirq_iret[]; extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; extern void lguest_iret(void); extern void lguest_init(void); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 543510a2f9e0..13616d708389 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,8 +87,7 @@ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_start = (u32)lguest_noirq_start, - .noirq_end = (u32)lguest_noirq_end, + .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR, diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S index 81678bf0fcb7..d5ae63f5ec5d 100644 --- a/arch/x86/lguest/head_32.S +++ b/arch/x86/lguest/head_32.S @@ -133,9 +133,8 @@ ENTRY(lg_restore_fl) ret /*:*/ -/* These demark the EIP range where host should never deliver interrupts. */ -.global lguest_noirq_start -.global lguest_noirq_end +/* These demark the EIP where host should never deliver interrupts. */ +.global lguest_noirq_iret /*M:004 * When the Host reflects a trap or injects an interrupt into the Guest, it @@ -174,12 +173,11 @@ ENTRY(lg_restore_fl) * * The second is harder: copying eflags to lguest_data.irq_enabled will turn * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to surround the - * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the + * return to userspace or wherever. Our solution to this is to tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we end up surrounding - * just one instruction, iret). + * data gets updated only after it completes, so we only need to protect + * one instruction, iret). */ ENTRY(lguest_iret) pushl 2*4(%esp) @@ -190,6 +188,5 @@ ENTRY(lguest_iret) * prefix makes sure we use the stack segment, which is still valid. */ popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_start: +lguest_noirq_iret: iret -lguest_noirq_end: -- cgit v1.2.3 From 6e0a0ea12962a2175a9f47621f9fe7a4c866cb12 Mon Sep 17 00:00:00 2001 From: Graeme Gregory Date: Tue, 24 Mar 2015 14:02:39 +0000 Subject: ACPI / sleep: Introduce CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT ACPI 5.1 does not currently support S states for ARM64 hardware but ACPI code will call acpi_target_system_state() and acpi_sleep_init() for device power management, so introduce CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT and select it for x86 and ia64 only to make sleep functions available, and also introduce stub function to allow other drivers to function until S states are defined for ARM64. It will be no functional change for x86 and IA64. Suggested-by: Rafael J. Wysocki Acked-by: Lorenzo Pieralisi Acked-by: Rafael J. Wysocki Signed-off-by: Graeme Gregory Signed-off-by: Tomasz Nowicki Signed-off-by: Hanjun Guo Signed-off-by: Will Deacon --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca55187..c3ea9f9a29d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER -- cgit v1.2.3 From 828aef376d7a129547bc4ebb949965040177e3da Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 24 Mar 2015 14:02:46 +0000 Subject: ACPI / processor: Introduce phys_cpuid_t for CPU hardware ID CPU hardware ID (phys_id) is defined as u32 in structure acpi_processor, but phys_id is used as int in acpi processor driver, so it will lead to some inconsistence for the drivers. Furthermore, to cater for ACPI arch ports that implement 64 bits CPU ids a generic CPU physical id type is required. So introduce typedef u32 phys_cpuid_t in a common file, and introduce a macro PHYS_CPUID_INVALID as (phys_cpuid_t)(-1) if it's not defined by other archs, this will solve the inconsistence in acpi processor driver, and will prepare for the ACPI on ARM64 for the 64 bit CPU hardware ID in the following patch. CC: Rafael J Wysocki Suggested-by: Lorenzo Pieralisi Reviewed-by: Grant Likely Acked-by: Sudeep Holla Acked-by: Lorenzo Pieralisi Acked-by: Rafael J. Wysocki Signed-off-by: Catalin Marinas [hj: reworked cpu physid map return codes] Signed-off-by: Hanjun Guo Signed-off-by: Will Deacon --- arch/x86/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3d525c6124f6..e4f8582eb756 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -757,7 +757,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } -- cgit v1.2.3 From 0f1b5ca240c65ed9533f193720f337bf24fb2f2f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2015 18:18:04 -0800 Subject: perf/x86/intel: Add new cache events table for Haswell Haswell offcore events are quite different from Sandy Bridge. Add a new table to handle Haswell properly. Note that the offcore bits listed in the SDM are not quite correct (this is currently being fixed). An uptodate list of bits is in the patch. The basic setup is similar to Sandy Bridge. The prefetch columns have been removed, as prefetch counting is not very reliable on Haswell. One L1 event that is not in the event list anymore has been also removed. - data reads do not include code reads (comparable to earlier Sandy Bridge tables) - data counts include speculative execution (except L1 write, dtlb, bpu) - remote node access includes both remote memory, remote cache, remote mmio. - prefetches are not included in the counts for consistency (different from Sandy Bridge, which includes prefetches in the remote node) Signed-off-by: Andi Kleen [ Removed the HSM30 comments; we don't have them for SNB/IVB either. ] Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424225886-18652-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 194 ++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9f1dd18fa395..5ef64bf88ecd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -415,6 +415,196 @@ static __initconst const u64 snb_hw_cache_event_ids }; +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts because they are not + * reliably counted. + */ + +#define HSW_DEMAND_DATA_RD BIT_ULL(0) +#define HSW_DEMAND_RFO BIT_ULL(1) +#define HSW_ANY_RESPONSE BIT_ULL(16) +#define HSW_SUPPLIER_NONE BIT_ULL(17) +#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) +#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) +#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) +#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) +#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_SNOOP_NONE BIT_ULL(31) +#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) +#define HSW_SNOOP_MISS BIT_ULL(33) +#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define HSW_SNOOP_HITM BIT_ULL(36) +#define HSW_SNOOP_NON_DRAM BIT_ULL(37) +#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ + HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ + HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ + HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) +#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) +#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD +#define HSW_DEMAND_WRITE HSW_DEMAND_RFO +#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ + HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_LLC_ACCESS HSW_ANY_RESPONSE + +static __initconst const u64 hsw_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 hsw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -2520,8 +2710,8 @@ __init int intel_pmu_init(void) case 69: /* 22nm Haswell ULT */ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_hsw(); -- cgit v1.2.3 From 91f1b70582c62576f429cf78d53751c66677553d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2015 18:18:05 -0800 Subject: perf/x86/intel: Add Broadwell core support Add Broadwell support for Broadwell to perf. The basic support is very similar to Haswell. We use the new cache event list added for Haswell earlier. The only differences are a few bits related to remote nodes. To avoid an extra, mostly identical, table these are patched up in the initialization code. The constraint list has one new event that needs to be handled over Haswell. Includes code and testing from Kan Liang. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424225886-18652-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5ef64bf88ecd..28838536a9f7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_bdw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -453,6 +462,12 @@ static __initconst const u64 snb_hw_cache_event_ids HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) #define HSW_LLC_ACCESS HSW_ANY_RESPONSE +#define BDW_L3_MISS_LOCAL BIT(26) +#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) + + static __initconst const u64 hsw_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -2730,6 +2745,38 @@ __init int intel_pmu_init(void) pr_cont("Haswell events, "); break; + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ + hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | + BDW_L3_MISS|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| + HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_bdw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Broadwell events, "); + break; + default: switch (x86_pmu.version) { case 1: -- cgit v1.2.3 From 294fe0f52a44c6f207211de0686c369a961b5533 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2015 18:18:06 -0800 Subject: perf/x86/intel: Add INST_RETIRED.ALL workarounds On Broadwell INST_RETIRED.ALL cannot be used with any period that doesn't have the lowest 6 bits cleared. And the period should not be smaller than 128. This is erratum BDM11 and BDM55: http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf BDM11: When using a period < 100; we may get incorrect PEBS/PMI interrupts and/or an invalid counter state. BDM55: When bit0-5 of the period are !0 we may get redundant PEBS records on overflow. Add a new callback to enforce this, and set it for Broadwell. How does this handle the case when an app requests a specific period with some of the bottom bits set? Short answer: Any useful instruction sampling period needs to be 4-6 orders of magnitude larger than 128, as an PMI every 128 instructions would instantly overwhelm the system and be throttled. So the +-64 error from this is really small compared to the period, much smaller than normal system jitter. Long answer (by Peterz): IFF we guarantee perf_event_attr::sample_period >= 128. Suppose we start out with sample_period=192; then we'll set period_left to 192, we'll end up with left = 128 (we truncate the lower bits). We get an interrupt, find that period_left = 64 (>0 so we return 0 and don't get an overflow handler), up that to 128. Then we trigger again, at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get an overflow). We increment with sample_period so we get left = 128. We fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an overflow). And on and on. So while the individual interrupts are 'wrong' we get then with interval=256,128 in exactly the right ratio to average out at 192. And this works for everything >=128. So the num_samples*fixed_period thing is still entirely correct +- 127, which is good enough I'd say, as you already have that error anyhow. So no need to 'fix' the tools, al we need to do is refuse to create INST_RETIRED:ALL events with sample_period < 128. Signed-off-by: Andi Kleen [ Updated comments and changelog a bit. ] Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++++ arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e0dab5ce61e9..ec6e982fd464 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index a371d27d6795..87e5081f4cdc 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -451,6 +451,7 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; + unsigned (*limit_period)(struct perf_event *event, unsigned l); /* * sysfs attrs diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 28838536a9f7..fc6dbc46af4a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2096,6 +2096,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) return c; } +/* + * Broadwell: + * + * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared + * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine + * the two to enforce a minimum period of 128 (the smallest value that has bits + * 0-5 cleared and >= 100). + * + * Because of how the code in x86_perf_event_set_period() works, the truncation + * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period + * to make up for the 'lost' events due to carrying the 'error' in period_left. + * + * Therefore the effective (average) period matches the requested period, + * despite coarser hardware granularity. + */ +static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +{ + if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (left < 128) + left = 128; + left &= ~0x3fu; + } + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -2774,6 +2800,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.limit_period = bdw_limit_period; pr_cont("Broadwell events, "); break; -- cgit v1.2.3 From 9332d250b4b4f67c633894b311e022e3cf943bd5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Feb 2015 10:45:43 -0700 Subject: perf/x86: Remove redundant calls to perf_pmu_{dis|en}able() perf_pmu_disable() is called before pmu->add() and perf_pmu_enable() is called afterwards. No need to call these inside of x86_pmu_add() as well. Signed-off-by: David Ahern Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424281543-67335-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ec6e982fd464..ac41b3ad1fc9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1044,7 +1044,6 @@ static int x86_pmu_add(struct perf_event *event, int flags) hwc = &event->hw; - perf_pmu_disable(event->pmu); n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) @@ -1082,7 +1081,6 @@ done_collect: ret = 0; out: - perf_pmu_enable(event->pmu); return ret; } -- cgit v1.2.3 From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, its all relative information, as long as its internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.. Where MONOTONIC is good for correlating between machines (static offset), MONOTNIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } -- cgit v1.2.3 From eabdc320ece583e16e581306c720e5f1ff67c3bb Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 21:58:17 +0200 Subject: crypto: aesni - mark AES-NI helper ciphers Flag all AES-NI helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 6893f4947583..f9a78f32f494 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm) PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); struct crypto_aead *cryptd_child; struct aesni_rfc4106_gcm_ctx *child_ctx; - cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); @@ -1262,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__aes-aesni", .cra_driver_name = "__driver-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1281,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__ecb-aes-aesni", .cra_driver_name = "__driver-ecb-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1301,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__cbc-aes-aesni", .cra_driver_name = "__driver-cbc-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1365,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__ctr-aes-aesni", .cra_driver_name = "__driver-ctr-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1409,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__gcm-aes-aesni", .cra_driver_name = "__driver-gcm-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, @@ -1479,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__lrw-aes-aesni", .cra_driver_name = "__driver-lrw-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesni_lrw_ctx), .cra_alignmask = 0, @@ -1500,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__xts-aes-aesni", .cra_driver_name = "__driver-xts-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesni_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 6a9b52b7fa1a376c1749cf441d1aeb8572d3b691 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:01:49 +0200 Subject: crypto: clmulni - mark ghash clmulni helper ciphers Flag all ash clmulni helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 8253d85aa165..2079baf06bdd 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = { .cra_name = "__ghash", .cra_driver_name = "__ghash-pclmulqdqni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_flags = CRYPTO_ALG_TYPE_SHASH | + CRYPTO_ALG_INTERNAL, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, @@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm) struct cryptd_ahash *cryptd_tfm; struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; -- cgit v1.2.3 From a62356a978180eb3ccd2bcac49764dfd628c72f8 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:03:17 +0200 Subject: crypto: camellia_aesni_avx2 - mark AES-NI Camellia helper ciphers Flag all AES-NI Camellia helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 9a07fafe3831..baf0ac21ace5 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ecb-camellia-aesni-avx2", .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__cbc-camellia-aesni-avx2", .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ctr-camellia-aesni-avx2", .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__lrw-camellia-aesni-avx2", .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_lrw_ctx), .cra_alignmask = 0, @@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__xts-camellia-aesni-avx2", .cra_driver_name = "__driver-xts-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 680574e8b3b2f62e507cdd9ca35c77cad0344bb2 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:03:57 +0200 Subject: crypto: cast5_avx - mark CAST5 helper ciphers Flag all CAST5 helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5_avx_glue.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 60ada677a928..236c80974457 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__ecb-cast5-avx", .cra_driver_name = "__driver-ecb-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, @@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__cbc-cast5-avx", .cra_driver_name = "__driver-cbc-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, @@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__ctr-cast5-avx", .cra_driver_name = "__driver-ctr-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 7d2c31dd70ed49210847ae8cc00f14064292b349 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:04:49 +0200 Subject: crypto: camellia_aesni_avx - mark AVX Camellia helper ciphers Flag all AVX Camellia helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_aesni_avx_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index ed38d959add6..78818a1e73e3 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ecb-camellia-aesni", .cra_driver_name = "__driver-ecb-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__cbc-camellia-aesni", .cra_driver_name = "__driver-cbc-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ctr-camellia-aesni", .cra_driver_name = "__driver-ctr-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__lrw-camellia-aesni", .cra_driver_name = "__driver-lrw-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_lrw_ctx), .cra_alignmask = 0, @@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__xts-camellia-aesni", .cra_driver_name = "__driver-xts-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From e69b8a46ca0ec38ef419071b00574bc053664e18 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:05:35 +0200 Subject: crypto: cast6_avx - mark CAST6 helper ciphers Flag all CAST6 helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6_avx_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 0160f68a57ff..f448810ca4ac 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__ecb-cast6-avx", .cra_driver_name = "__driver-ecb-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__cbc-cast6-avx", .cra_driver_name = "__driver-cbc-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__ctr-cast6-avx", .cra_driver_name = "__driver-ctr-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__lrw-cast6-avx", .cra_driver_name = "__driver-lrw-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_lrw_ctx), .cra_alignmask = 0, @@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__xts-cast6-avx", .cra_driver_name = "__driver-xts-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From f82419acd8ad6b045a040ba8f5f0289972189826 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:06:13 +0200 Subject: crypto: serpent_avx2 - mark Serpent AVX2 helper ciphers Flag all Serpent AVX2 helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_avx2_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 437e47a4d302..2f63dc89e7a9 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__ecb-serpent-avx2", .cra_driver_name = "__driver-ecb-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__cbc-serpent-avx2", .cra_driver_name = "__driver-cbc-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__ctr-serpent-avx2", .cra_driver_name = "__driver-ctr-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__lrw-serpent-avx2", .cra_driver_name = "__driver-lrw-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__xts-serpent-avx2", .cra_driver_name = "__driver-xts-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 65aed5394120953ccb6e307d75f9abd17c15740e Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:07:05 +0200 Subject: crypto: serpent_avx - mark Serpent AVX helper ciphers Flag all Serpent AVX helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_avx_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7e217398b4eb..c8d478af8456 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-avx", .cra_driver_name = "__driver-ecb-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__cbc-serpent-avx", .cra_driver_name = "__driver-cbc-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ctr-serpent-avx", .cra_driver_name = "__driver-ctr-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__lrw-serpent-avx", .cra_driver_name = "__driver-lrw-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__xts-serpent-avx", .cra_driver_name = "__driver-xts-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 748be1f1bfddfe83d4b31c17191b082e96c86867 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:07:45 +0200 Subject: crypto: serpent_sse2 - mark Serpent SSE2 helper ciphers Flag all Serpent SSE2 helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index bf025adaea01..3643dd508f45 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-sse2", .cra_driver_name = "__driver-ecb-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__cbc-serpent-sse2", .cra_driver_name = "__driver-cbc-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ctr-serpent-sse2", .cra_driver_name = "__driver-ctr-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__lrw-serpent-sse2", .cra_driver_name = "__driver-lrw-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__xts-serpent-sse2", .cra_driver_name = "__driver-xts-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 4dda66f62e01e832e09d6d1fbd9c9070d57b049e Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:08:53 +0200 Subject: crypto: twofish_avx - mark Twofish AVX helper ciphers Flag all Twofish AVX helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_avx_glue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 1ac531ea9bcc..b5e2d5651851 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__ecb-twofish-avx", .cra_driver_name = "__driver-ecb-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__cbc-twofish-avx", .cra_driver_name = "__driver-cbc-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__ctr-twofish-avx", .cra_driver_name = "__driver-ctr-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__lrw-twofish-avx", .cra_driver_name = "__driver-lrw-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_lrw_ctx), .cra_alignmask = 0, @@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__xts-twofish-avx", .cra_driver_name = "__driver-xts-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_xts_ctx), .cra_alignmask = 0, -- cgit v1.2.3 From 555fa17b2b8cc865c203de263443041eb75bc7f7 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Mon, 30 Mar 2015 22:11:46 +0200 Subject: crypto: sha-mb - mark Multi buffer SHA1 helper cipher Flag all Multi buffer SHA1 helper ciphers as internal ciphers to prevent them from being called by normal users. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 9414fd964ecd..e510b1c5d690 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = { * use ASYNC flag as some buffers in multi-buffer * algo may not have completed before hashing thread sleep */ - .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), @@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); struct mcryptd_hash_ctx *mctx; - mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(mcryptd_tfm)) return PTR_ERR(mcryptd_tfm); mctx = crypto_ahash_ctx(&mcryptd_tfm->base); -- cgit v1.2.3 From ec776ef6bbe1734c29cd6bd05219cd93b2731bd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Apr 2015 09:12:18 +0200 Subject: x86/mm: Add support for the non-standard protected e820 type Various recent BIOSes support NVDIMMs or ADR using a non-standard e820 memory type, and Intel supplied reference Linux code using this type to various vendors. Wire this e820 table type up to export platform devices for the pmem driver so that we can use it in Linux. Based on earlier work from: Dave Jiang Dan Williams Includes fixes for NUMA regions from Boaz Harrosh. Tested-by: Ross Zwisler Signed-off-by: Christoph Hellwig Acked-by: Dan Williams Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Jens Axboe Cc: Keith Busch Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: linux-nvdimm@ml01.01.org Link: http://lkml.kernel.org/r/1427872339-6688-2-git-send-email-hch@lst.de [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ++++++++ arch/x86/include/uapi/asm/e820.h | 10 ++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/e820.c | 26 +++++++++++++++----- arch/x86/kernel/pmem.c | 53 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/pmem.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca55187..9e3bcd6f4a48 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY + bool "Support non-standard NVDIMMs and ADR protected memory" + help + Treat memory marked using the non-standard e820 type of 12 as used + by the Intel Sandy Bridge-EP reference BIOS as protected memory. + The kernel will offer these regions to the 'pmem' driver so + they can be used for persistent storage. + + Say Y if unsure. + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index d993e33f5236..960a8a9dc4ab 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -33,6 +33,16 @@ #define E820_NVS 4 #define E820_UNUSABLE 5 +/* + * This is a non-standardized way to represent ADR or NVDIMM regions that + * persist over a reboot. The kernel will ignore their special capabilities + * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * + * ( Note that older platforms also used 6 for the same type of memory, + * but newer versions switched to 12 as 6 was assigned differently. Some + * time they will learn... ) + */ +#define E820_PRAM 12 /* * reserved RAM used by kernel itself diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70ddad0..971f18cd9ca0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46201deee923..11cc7d54ec3f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PRAM: + printk(KERN_CONT "persistent (type %u)", type); + break; default: printk(KERN_CONT "type %u", type); break; @@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type) { + if (current_type != last_type || current_type == E820_PRAM) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) register_nosave_region(pfn, PFN_UP(ei->addr)); pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) register_nosave_region(PFN_UP(ei->addr), pfn); @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn) { int i; unsigned long last_pfn = 0; @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + /* + * Persistent memory is accounted as ram for purposes of + * establishing max_pfn and mem_map. + */ + if (ei->type != E820_RAM && ei->type != E820_PRAM) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } static void early_panic(char *msg) @@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p) } else if (*p == '$') { start_at = memparse(p+1, &p); e820_add_region(start_at, mem_size, E820_RESERVED); + } else if (*p == '!') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_PRAM); } else e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); @@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; + case E820_PRAM: return "Persistent RAM"; default: return "reserved"; } } @@ -940,7 +952,9 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + if (((e820.map[i].type != E820_RESERVED) && + (e820.map[i].type != E820_PRAM)) || + res->start < (1ULL<<20)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c new file mode 100644 index 000000000000..3420c874ddc5 --- /dev/null +++ b/arch/x86/kernel/pmem.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015, Christoph Hellwig. + */ +#include +#include +#include +#include +#include +#include + +static __init void register_pmem_device(struct resource *res) +{ + struct platform_device *pdev; + int error; + + pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); + if (!pdev) + return; + + error = platform_device_add_resources(pdev, res, 1); + if (error) + goto out_put_pdev; + + error = platform_device_add(pdev); + if (error) + goto out_put_pdev; + return; + +out_put_pdev: + dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); + platform_device_put(pdev); +} + +static __init int register_pmem_devices(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_PRAM) { + struct resource res = { + .flags = IORESOURCE_MEM, + .start = ei->addr, + .end = ei->addr + ei->size - 1, + }; + register_pmem_device(&res); + } + } + + return 0; +} +device_initcall(register_pmem_devices); -- cgit v1.2.3 From a436bb7b806383ae0593cab53d17fc9676270cd3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 27 Mar 2015 20:43:36 +0900 Subject: kbuild: use relative path more to include Makefile Prior to this commit, it was impossible to use relative path to include Makefiles from the top level Makefile because the option "--include-dir=$(srctree)" becomes effective when Make enters into sub Makefiles. To use relative path in any places, this commit moves the option above the "sub-make" target. Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek --- arch/x86/Makefile | 2 +- arch/x86/Makefile.um | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5ba2d9ce82dc..2fda005bb334 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y) $(call cc-option,-fno-unit-at-a-time)) # CPU-specific tuning. Anything which can be shared with UML should go here. - include $(srctree)/arch/x86/Makefile_32.cpu + include arch/x86/Makefile_32.cpu KBUILD_CFLAGS += $(cflags-y) # temporary until string.h is fixed diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 95eba554baf9..5b7e898ffd9a 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -18,7 +18,7 @@ LDS_EXTRA := -Ui386 export LDS_EXTRA # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. -include $(srctree)/arch/x86/Makefile_32.cpu +include arch/x86/Makefile_32.cpu # prevent gcc from keeping the stack 16 byte aligned. Taken from i386. cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) -- cgit v1.2.3 From ed69628b3b04578179334393d7f5fe60a2681f1c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:19 +0200 Subject: x86: Add Intel Processor Trace (INTEL_PT) cpu feature detection Intel Processor Trace is an architecture extension that allows for program flow tracing. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-11-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 361922dcc9b1..c1553b70fed4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -195,6 +195,7 @@ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 60639093d536..3d423a101fae 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, -- cgit v1.2.3 From 4807034248bed58d49a4f9f450c024e3b0f58577 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:20 +0200 Subject: perf/x86: Mark Intel PT and LBR/BTS as mutually exclusive Intel PT cannot be used at the same time as LBR or BTS and will cause a general protection fault if they are used together. In order to avoid fixing up GPs in the fast path, instead we disallow creating LBR/BTS events when PT events are present and vice versa. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-12-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 43 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.h | 40 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++++ 3 files changed, 94 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0420ebcac116..549d01d6d996 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event) } } +void hw_perf_lbr_event_destroy(struct perf_event *event) +{ + hw_perf_event_destroy(event); + + /* undo the lbr/bts event accounting */ + x86_del_exclusive(x86_lbr_exclusive_lbr); +} + static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; @@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } +/* + * Check if we can create event of a certain type (that no conflicting events + * are present). + */ +int x86_add_exclusive(unsigned int what) +{ + int ret = -EBUSY, i; + + if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) + return 0; + + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto out; + + atomic_inc(&x86_pmu.lbr_exclusive[what]); + ret = 0; + +out: + mutex_unlock(&pmc_reserve_mutex); + return ret; +} + +void x86_del_exclusive(unsigned int what) +{ + atomic_dec(&x86_pmu.lbr_exclusive[what]); +} + int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; @@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; } hwc->config |= config; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 87e5081f4cdc..47499661e8d4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -408,6 +408,12 @@ union x86_pmu_config { #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value +enum { + x86_lbr_exclusive_lbr, + x86_lbr_exclusive_pt, + x86_lbr_exclusive_max, +}; + /* * struct x86_pmu - generic x86 pmu */ @@ -505,6 +511,11 @@ struct x86_pmu { const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ + /* + * Intel PT/LBR/BTS are exclusive + */ + atomic_t lbr_exclusive[x86_lbr_exclusive_max]; + /* * Extra registers for events */ @@ -603,6 +614,12 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } +int x86_add_exclusive(unsigned int what); + +void x86_del_exclusive(unsigned int what); + +void hw_perf_lbr_event_destroy(struct perf_event *event); + int x86_setup_perfctr(struct perf_event *event); int x86_pmu_hw_config(struct perf_event *event); @@ -689,6 +706,29 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) + return true; + + return false; +} + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !event->attr.freq && event->hw.sample_period == 1) + return true; + + return false; +} + int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index fc6dbc46af4a..b7b3ff21c832 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1942,6 +1942,17 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + + /* + * BTS is set up earlier in this path, so don't account twice + */ + if (!intel_pmu_has_bts(event)) { + /* disallow lbr if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } } if (event->attr.type != PERF_TYPE_RAW) -- cgit v1.2.3 From 52ca9ced3f70779589e6ecc329baffe69d8f5f7a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 30 Jan 2015 12:39:52 +0200 Subject: perf/x86/intel/pt: Add Intel PT PMU driver Add support for Intel Processor Trace (PT) to kernel's perf events. PT is an extension of Intel Architecture that collects information about software execuction such as control flow, execution modes and timings and formats it into highly compressed binary packets. Even being compressed, these packets are generated at hundreds of megabytes per second per core, which makes it impractical to decode them on the fly in the kernel. This driver exports trace data by through AUX space in the perf ring buffer, which is zero-copy mapped into userspace for faster data retrieval. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1422614392-114498-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/msr-index.h | 18 + arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/intel_pt.h | 131 ++++ arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 8 + arch/x86/kernel/cpu/perf_event_intel_pt.c | 1096 +++++++++++++++++++++++++++++ 6 files changed, 1256 insertions(+) create mode 100644 arch/x86/kernel/cpu/intel_pt.h create mode 100644 arch/x86/kernel/cpu/perf_event_intel_pt.c (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 3ce079136c11..1a4eae695ca8 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -74,6 +74,24 @@ #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + #define MSR_MTRRfix64K_00000 0x00000250 #define MSR_MTRRfix16K_80000 0x00000258 #define MSR_MTRRfix16K_A0000 0x00000259 diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6c1ca139f736..e6b353f10e09 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -40,6 +40,7 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h new file mode 100644 index 000000000000..1c338b0eba05 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -0,0 +1,131 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#ifndef __INTEL_PT_H__ +#define __INTEL_PT_H__ + +/* + * Single-entry ToPA: when this close to region boundary, switch + * buffers to avoid losing data. + */ +#define TOPA_PMI_MARGIN 512 + +/* + * Table of Physical Addresses bits + */ +enum topa_sz { + TOPA_4K = 0, + TOPA_8K, + TOPA_16K, + TOPA_32K, + TOPA_64K, + TOPA_128K, + TOPA_256K, + TOPA_512K, + TOPA_1MB, + TOPA_2MB, + TOPA_4MB, + TOPA_8MB, + TOPA_16MB, + TOPA_32MB, + TOPA_64MB, + TOPA_128MB, + TOPA_SZ_END, +}; + +static inline unsigned int sizes(enum topa_sz tsz) +{ + return 1 << (tsz + 12); +}; + +struct topa_entry { + u64 end : 1; + u64 rsvd0 : 1; + u64 intr : 1; + u64 rsvd1 : 1; + u64 stop : 1; + u64 rsvd2 : 1; + u64 size : 4; + u64 rsvd3 : 2; + u64 base : 36; + u64 rsvd4 : 16; +}; + +#define TOPA_SHIFT 12 +#define PT_CPUID_LEAVES 2 + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_payloads_lip, +}; + +struct pt_pmu { + struct pmu pmu; + u32 caps[4 * PT_CPUID_LEAVES]; +}; + +/** + * struct pt_buffer - buffer configuration; one buffer per task_struct or + * cpu, depending on perf event configuration + * @cpu: cpu for per-cpu allocation + * @tables: list of ToPA tables in this buffer + * @first: shorthand for first topa table + * @last: shorthand for last topa table + * @cur: current topa table + * @nr_pages: buffer size in pages + * @cur_idx: current output region's index within @cur table + * @output_off: offset within the current output region + * @data_size: running total of the amount of data in this buffer + * @lost: if data was lost/truncated + * @head: logical write offset inside the buffer + * @snapshot: if this is for a snapshot/overwrite counter + * @stop_pos: STOP topa entry in the buffer + * @intr_pos: INT topa entry in the buffer + * @data_pages: array of pages from perf + * @topa_index: table of topa entries indexed by page offset + */ +struct pt_buffer { + int cpu; + struct list_head tables; + struct topa *first, *last, *cur; + unsigned int cur_idx; + size_t output_off; + unsigned long nr_pages; + local_t data_size; + local_t lost; + local64_t head; + bool snapshot; + unsigned long stop_pos, intr_pos; + void **data_pages; + struct topa_entry *topa_index[0]; +}; + +/** + * struct pt - per-cpu pt context + * @handle: perf output handle + * @handle_nmi: do handle PT PMI on this cpu, there's an active event + */ +struct pt { + struct perf_output_handle handle; + int handle_nmi; +}; + +#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 47499661e8d4..f04729ac3290 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -808,6 +808,8 @@ void intel_pmu_lbr_init_hsw(void); int intel_pmu_setup_lbr_filter(struct perf_event *event); +void intel_pt_interrupt(void); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b7b3ff21c832..8eb22ce26303 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1589,6 +1589,14 @@ again: x86_pmu.drain_pebs(regs); } + /* + * Intel PT + */ + if (__test_and_clear_bit(55, (unsigned long *)&status)) { + handled++; + intel_pt_interrupt(); + } + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c new file mode 100644 index 000000000000..a9a1092cf836 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -0,0 +1,1096 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include +#include +#include + +#include "perf_event.h" +#include "intel_pt.h" + +static DEFINE_PER_CPU(struct pt, pt_ctx); + +static struct pt_pmu pt_pmu; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +/* + * Capabilities of Intel PT hardware, such as number of address bits or + * supported output schemes, are cached and exported to userspace as "caps" + * attribute group of pt pmu device + * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store + * relevant bits together with intel_pt traces. + * + * These are necessary for both trace decoding (payloads_lip, contains address + * width encoded in IP-related packets), and event configuration (bitmasks with + * permitted values for certain bit fields). + */ +#define PT_CAP(_n, _l, _r, _m) \ + [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ + .reg = _r, .mask = _m } + +static struct pt_cap_desc { + const char *name; + u32 leaf; + u8 reg; + u32 mask; +} pt_caps[] = { + PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(topa_output, 0, CR_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), +}; + +static u32 pt_cap_get(enum pt_capabilities cap) +{ + struct pt_cap_desc *cd = &pt_caps[cap]; + u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + unsigned int shift = __ffs(cd->mask); + + return (c & cd->mask) >> shift; +} + +static ssize_t pt_cap_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + enum pt_capabilities cap = (long)ea->var; + + return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); +} + +static struct attribute_group pt_cap_group = { + .name = "caps", +}; + +PMU_FORMAT_ATTR(tsc, "config:10" ); +PMU_FORMAT_ATTR(noretcomp, "config:11" ); + +static struct attribute *pt_formats_attr[] = { + &format_attr_tsc.attr, + &format_attr_noretcomp.attr, + NULL, +}; + +static struct attribute_group pt_format_group = { + .name = "format", + .attrs = pt_formats_attr, +}; + +static const struct attribute_group *pt_attr_groups[] = { + &pt_cap_group, + &pt_format_group, + NULL, +}; + +static int __init pt_pmu_hw_init(void) +{ + struct dev_ext_attribute *de_attrs; + struct attribute **attrs; + size_t size; + long i; + + if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) { + for (i = 0; i < PT_CPUID_LEAVES; i++) + cpuid_count(20, i, + &pt_pmu.caps[CR_EAX + i * 4], + &pt_pmu.caps[CR_EBX + i * 4], + &pt_pmu.caps[CR_ECX + i * 4], + &pt_pmu.caps[CR_EDX + i * 4]); + } else { + return -ENODEV; + } + + size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1); + attrs = kzalloc(size, GFP_KERNEL); + if (!attrs) + goto err_attrs; + + size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1); + de_attrs = kzalloc(size, GFP_KERNEL); + if (!de_attrs) + goto err_de_attrs; + + for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { + de_attrs[i].attr.attr.name = pt_caps[i].name; + + sysfs_attr_init(&de_attrs[i].attr.attr); + de_attrs[i].attr.attr.mode = S_IRUGO; + de_attrs[i].attr.show = pt_cap_show; + de_attrs[i].var = (void *)i; + attrs[i] = &de_attrs[i].attr.attr; + } + + pt_cap_group.attrs = attrs; + return 0; + +err_de_attrs: + kfree(de_attrs); +err_attrs: + kfree(attrs); + + return -ENOMEM; +} + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) + +static bool pt_event_valid(struct perf_event *event) +{ + u64 config = event->attr.config; + + if ((config & PT_CONFIG_MASK) != config) + return false; + + return true; +} + +/* + * PT configuration helpers + * These all are cpu affine and operate on a local PT + */ + +static bool pt_is_running(void) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + + return !!(ctl & RTIT_CTL_TRACEEN); +} + +static void pt_config(struct perf_event *event) +{ + u64 reg; + + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + + if (!event->attr.exclude_kernel) + reg |= RTIT_CTL_OS; + if (!event->attr.exclude_user) + reg |= RTIT_CTL_USR; + + reg |= (event->attr.config & PT_CONFIG_MASK); + + wrmsrl(MSR_IA32_RTIT_CTL, reg); +} + +static void pt_config_start(bool start) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + if (start) + ctl |= RTIT_CTL_TRACEEN; + else + ctl &= ~RTIT_CTL_TRACEEN; + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + /* + * A wrmsr that disables trace generation serializes other PT + * registers and causes all data packets to be written to memory, + * but a fence is required for the data to become globally visible. + * + * The below WMB, separating data store and aux_head store matches + * the consumer's RMB that separates aux_head load and data load. + */ + if (!start) + wmb(); +} + +static void pt_config_buffer(void *buf, unsigned int topa_idx, + unsigned int output_off) +{ + u64 reg; + + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); + + reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); + + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); +} + +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) + +/** + * struct topa - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @list: linkage to struct pt_buffer's list of tables + * @phys: physical address of this page + * @offset: offset of the first entry in this table in the buffer + * @size: total size of all entries in this table + * @last: index of the last initialized entry in this table + */ +struct topa { + struct topa_entry table[TENTS_PER_PAGE]; + struct list_head list; + u64 phys; + u64 offset; + size_t size; + int last; +}; + +/* make -1 stand for the last table entry */ +#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) + +/** + * topa_alloc() - allocate page-sized ToPA table + * @cpu: CPU on which to allocate. + * @gfp: Allocation flags. + * + * Return: On success, return the pointer to ToPA table page. + */ +static struct topa *topa_alloc(int cpu, gfp_t gfp) +{ + int node = cpu_to_node(cpu); + struct topa *topa; + struct page *p; + + p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); + if (!p) + return NULL; + + topa = page_address(p); + topa->last = 0; + topa->phys = page_to_phys(p); + + /* + * In case of singe-entry ToPA, always put the self-referencing END + * link as the 2nd entry in the table + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(topa, 1)->end = 1; + } + + return topa; +} + +/** + * topa_free() - free a page-sized ToPA table + * @topa: Table to deallocate. + */ +static void topa_free(struct topa *topa) +{ + free_page((unsigned long)topa); +} + +/** + * topa_insert_table() - insert a ToPA table into a buffer + * @buf: PT buffer that's being extended. + * @topa: New topa table to be inserted. + * + * If it's the first table in this buffer, set up buffer's pointers + * accordingly; otherwise, add a END=1 link entry to @topa to the current + * "last" table and adjust the last table pointer to @topa. + */ +static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) +{ + struct topa *last = buf->last; + + list_add_tail(&topa->list, &buf->tables); + + if (!buf->first) { + buf->first = buf->last = buf->cur = topa; + return; + } + + topa->offset = last->offset + last->size; + buf->last = topa; + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return; + + BUG_ON(last->last != TENTS_PER_PAGE - 1); + + TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(last, -1)->end = 1; +} + +/** + * topa_table_full() - check if a ToPA table is filled up + * @topa: ToPA table. + */ +static bool topa_table_full(struct topa *topa) +{ + /* single-entry ToPA is a special case */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return !!topa->last; + + return topa->last == TENTS_PER_PAGE - 1; +} + +/** + * topa_insert_pages() - create a list of ToPA tables + * @buf: PT buffer being initialized. + * @gfp: Allocation flags. + * + * This initializes a list of ToPA tables with entries from + * the data_pages provided by rb_alloc_aux(). + * + * Return: 0 on success or error code. + */ +static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) +{ + struct topa *topa = buf->last; + int order = 0; + struct page *p; + + p = virt_to_page(buf->data_pages[buf->nr_pages]); + if (PagePrivate(p)) + order = page_private(p); + + if (topa_table_full(topa)) { + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + } + + TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; + TOPA_ENTRY(topa, -1)->size = order; + if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, -1)->intr = 1; + TOPA_ENTRY(topa, -1)->stop = 1; + } + + topa->last++; + topa->size += sizes(order); + + buf->nr_pages += 1ul << order; + + return 0; +} + +/** + * pt_topa_dump() - print ToPA tables and their entries + * @buf: PT buffer. + */ +static void pt_topa_dump(struct pt_buffer *buf) +{ + struct topa *topa; + + list_for_each_entry(topa, &buf->tables, list) { + int i; + + pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table, + (void *)topa->phys, topa->offset, topa->size); + for (i = 0; i < TENTS_PER_PAGE; i++) { + pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", + &topa->table[i], + (unsigned long)topa->table[i].base << TOPA_SHIFT, + sizes(topa->table[i].size), + topa->table[i].end ? 'E' : ' ', + topa->table[i].intr ? 'I' : ' ', + topa->table[i].stop ? 'S' : ' ', + *(u64 *)&topa->table[i]); + if ((pt_cap_get(PT_CAP_topa_multiple_entries) && + topa->table[i].stop) || + topa->table[i].end) + break; + } + } +} + +/** + * pt_buffer_advance() - advance to the next output region + * @buf: PT buffer. + * + * Advance the current pointers in the buffer to the next ToPA entry. + */ +static void pt_buffer_advance(struct pt_buffer *buf) +{ + buf->output_off = 0; + buf->cur_idx++; + + if (buf->cur_idx == buf->cur->last) { + if (buf->cur == buf->last) + buf->cur = buf->first; + else + buf->cur = list_entry(buf->cur->list.next, struct topa, + list); + buf->cur_idx = 0; + } +} + +/** + * pt_update_head() - calculate current offsets and sizes + * @pt: Per-cpu pt context. + * + * Update buffer's current write pointer position and data size. + */ +static void pt_update_head(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + u64 topa_idx, base, old; + + /* offset of the first region in this table from the beginning of buf */ + base = buf->cur->offset + buf->output_off; + + /* offset of the current output region within this table */ + for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) + base += sizes(buf->cur->table[topa_idx].size); + + if (buf->snapshot) { + local_set(&buf->data_size, base); + } else { + old = (local64_xchg(&buf->head, base) & + ((buf->nr_pages << PAGE_SHIFT) - 1)); + if (base < old) + base += buf->nr_pages << PAGE_SHIFT; + + local_add(base - old, &buf->data_size); + } +} + +/** + * pt_buffer_region() - obtain current output region's address + * @buf: PT buffer. + */ +static void *pt_buffer_region(struct pt_buffer *buf) +{ + return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); +} + +/** + * pt_buffer_region_size() - obtain current output region's size + * @buf: PT buffer. + */ +static size_t pt_buffer_region_size(struct pt_buffer *buf) +{ + return sizes(buf->cur->table[buf->cur_idx].size); +} + +/** + * pt_handle_status() - take care of possible status conditions + * @pt: Per-cpu pt context. + */ +static void pt_handle_status(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + int advance = 0; + u64 status; + + rdmsrl(MSR_IA32_RTIT_STATUS, status); + + if (status & RTIT_STATUS_ERROR) { + pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); + pt_topa_dump(buf); + status &= ~RTIT_STATUS_ERROR; + } + + if (status & RTIT_STATUS_STOPPED) { + status &= ~RTIT_STATUS_STOPPED; + + /* + * On systems that only do single-entry ToPA, hitting STOP + * means we are already losing data; need to let the decoder + * know. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) || + buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + local_inc(&buf->lost); + advance++; + } + } + + /* + * Also on single-entry ToPA implementations, interrupt will come + * before the output reaches its output region's boundary. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && + pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { + void *head = pt_buffer_region(buf); + + /* everything within this margin needs to be zeroed out */ + memset(head + buf->output_off, 0, + pt_buffer_region_size(buf) - + buf->output_off); + advance++; + } + + if (advance) + pt_buffer_advance(buf); + + wrmsrl(MSR_IA32_RTIT_STATUS, status); +} + +/** + * pt_read_offset() - translate registers into buffer pointers + * @buf: PT buffer. + * + * Set buffer's output pointers from MSR values. + */ +static void pt_read_offset(struct pt_buffer *buf) +{ + u64 offset, base_topa; + + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); + buf->cur = phys_to_virt(base_topa); + + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + /* offset within current output region */ + buf->output_off = offset >> 32; + /* index of current output region within this table */ + buf->cur_idx = (offset & 0xffffff80) >> 7; +} + +/** + * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry + * @buf: PT buffer. + * @pg: Page offset in the buffer. + * + * When advancing to the next output region (ToPA entry), given a page offset + * into the buffer, we need to find the offset of the first page in the next + * region. + */ +static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +{ + struct topa_entry *te = buf->topa_index[pg]; + + /* one region */ + if (buf->first == buf->last && buf->first->last == 1) + return pg; + + do { + pg++; + pg &= buf->nr_pages - 1; + } while (buf->topa_index[pg] == te); + + return pg; +} + +/** + * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer + * @buf: PT buffer. + * @handle: Current output handle. + * + * Place INT and STOP marks to prevent overwriting old data that the consumer + * hasn't yet collected. + */ +static int pt_buffer_reset_markers(struct pt_buffer *buf, + struct perf_output_handle *handle) + +{ + unsigned long idx, npages, end; + + if (buf->snapshot) + return 0; + + /* can't stop in the middle of an output region */ + if (buf->output_off + handle->size + 1 < + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + return -EINVAL; + + + /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return 0; + + /* clear STOP and INT from current entry */ + buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->intr_pos]->intr = 0; + + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + npages = (handle->size + 1) >> PAGE_SHIFT; + end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; + /*if (end > handle->wakeup >> PAGE_SHIFT) + end = handle->wakeup >> PAGE_SHIFT;*/ + idx = end & (buf->nr_pages - 1); + buf->stop_pos = idx; + idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; + } + + buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->intr_pos]->intr = 1; + + return 0; +} + +/** + * pt_buffer_setup_topa_index() - build topa_index[] table of regions + * @buf: PT buffer. + * + * topa_index[] references output regions indexed by offset into the + * buffer for purposes of quick reverse lookup. + */ +static void pt_buffer_setup_topa_index(struct pt_buffer *buf) +{ + struct topa *cur = buf->first, *prev = buf->last; + struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), + *te_prev = TOPA_ENTRY(prev, prev->last - 1); + int pg = 0, idx = 0, ntopa = 0; + + while (pg < buf->nr_pages) { + int tidx; + + /* pages within one topa entry */ + for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) + buf->topa_index[pg] = te_prev; + + te_prev = te_cur; + + if (idx == cur->last - 1) { + /* advance to next topa table */ + idx = 0; + cur = list_entry(cur->list.next, struct topa, list); + ntopa++; + } else + idx++; + te_cur = TOPA_ENTRY(cur, idx); + } + +} + +/** + * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head + * @buf: PT buffer. + * @head: Write pointer (aux_head) from AUX buffer. + * + * Find the ToPA table and entry corresponding to given @head and set buffer's + * "current" pointers accordingly. + */ +static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) +{ + int pg; + + if (buf->snapshot) + head &= (buf->nr_pages << PAGE_SHIFT) - 1; + + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + pg = pt_topa_next_entry(buf, pg); + + buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); + buf->cur_idx = ((unsigned long)buf->topa_index[pg] - + (unsigned long)buf->cur) / sizeof(struct topa_entry); + buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + + local64_set(&buf->head, head); + local_set(&buf->data_size, 0); +} + +/** + * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer + * @buf: PT buffer. + */ +static void pt_buffer_fini_topa(struct pt_buffer *buf) +{ + struct topa *topa, *iter; + + list_for_each_entry_safe(topa, iter, &buf->tables, list) { + /* + * right now, this is in free_aux() path only, so + * no need to unlink this table from the list + */ + topa_free(topa); + } +} + +/** + * pt_buffer_init_topa() - initialize ToPA table for pt buffer + * @buf: PT buffer. + * @size: Total size of all regions within this ToPA. + * @gfp: Allocation flags. + */ +static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, + gfp_t gfp) +{ + struct topa *topa; + int err; + + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + + while (buf->nr_pages < nr_pages) { + err = topa_insert_pages(buf, gfp); + if (err) { + pt_buffer_fini_topa(buf); + return -ENOMEM; + } + } + + pt_buffer_setup_topa_index(buf); + + /* link last table to the first one, unless we're double buffering */ + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->end = 1; + } + + pt_topa_dump(buf); + return 0; +} + +/** + * pt_buffer_setup_aux() - set up topa tables for a PT buffer + * @cpu: Cpu on which to allocate, -1 means current. + * @pages: Array of pointers to buffer pages passed from perf core. + * @nr_pages: Number of pages in the buffer. + * @snapshot: If this is a snapshot/overwrite counter. + * + * This is a pmu::setup_aux callback that sets up ToPA tables and all the + * bookkeeping for an AUX buffer. + * + * Return: Our private PT buffer structure. + */ +static void * +pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) +{ + struct pt_buffer *buf; + int node, ret; + + if (!nr_pages) + return NULL; + + if (cpu == -1) + cpu = raw_smp_processor_id(); + node = cpu_to_node(cpu); + + buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), + GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->cpu = cpu; + buf->snapshot = snapshot; + buf->data_pages = pages; + + INIT_LIST_HEAD(&buf->tables); + + ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + if (ret) { + kfree(buf); + return NULL; + } + + return buf; +} + +/** + * pt_buffer_free_aux() - perf AUX deallocation path callback + * @data: PT buffer. + */ +static void pt_buffer_free_aux(void *data) +{ + struct pt_buffer *buf = data; + + pt_buffer_fini_topa(buf); + kfree(buf); +} + +/** + * pt_buffer_is_full() - check if the buffer is full + * @buf: PT buffer. + * @pt: Per-cpu pt handle. + * + * If the user hasn't read data from the output region that aux_head + * points to, the buffer is considered full: the user needs to read at + * least this region and update aux_tail to point past it. + */ +static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= pt->handle.size) + return true; + + return false; +} + +/** + * intel_pt_interrupt() - PT PMI handler + */ +void intel_pt_interrupt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + struct perf_event *event = pt->handle.event; + + /* + * There may be a dangling PT bit in the interrupt status register + * after PT has been disabled by pt_event_stop(). Make sure we don't + * do anything (particularly, re-enable) for this event here. + */ + if (!ACCESS_ONCE(pt->handle_nmi)) + return; + + pt_config_start(false); + + if (!event) + return; + + buf = perf_get_aux(&pt->handle); + if (!buf) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + + if (!event->hw.state) { + int ret; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + return; + } + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + pt_config(event); + } +} + +/* + * PMU callbacks + */ + +static void pt_event_start(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + ACCESS_ONCE(pt->handle_nmi) = 1; + event->hw.state = 0; + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + pt_config(event); +} + +static void pt_event_stop(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + /* + * Protect against the PMI racing with disabling wrmsr, + * see comment in intel_pt_interrupt(). + */ + ACCESS_ONCE(pt->handle_nmi) = 0; + pt_config_start(false); + + if (event->hw.state == PERF_HES_STOPPED) + return; + + event->hw.state = PERF_HES_STOPPED; + + if (mode & PERF_EF_UPDATE) { + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf) + return; + + if (WARN_ON_ONCE(pt->handle.event != event)) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + } +} + +static void pt_event_del(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + + pt_event_stop(event, PERF_EF_UPDATE); + + buf = perf_get_aux(&pt->handle); + + if (buf) { + if (buf->snapshot) + pt->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + } +} + +static int pt_event_add(struct perf_event *event, int mode) +{ + struct pt_buffer *buf; + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + if (pt->handle.event) + goto out; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + ret = -EINVAL; + goto out; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + goto out; + } + } + + if (mode & PERF_EF_START) { + pt_event_start(event, 0); + if (hwc->state == PERF_HES_STOPPED) { + pt_event_del(event, 0); + ret = -EBUSY; + } + } else { + hwc->state = PERF_HES_STOPPED; + } + + ret = 0; +out: + + if (ret) + hwc->state = PERF_HES_STOPPED; + + return ret; +} + +static void pt_event_read(struct perf_event *event) +{ +} + +static void pt_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_pt); +} + +static int pt_event_init(struct perf_event *event) +{ + if (event->attr.type != pt_pmu.pmu.type) + return -ENOENT; + + if (!pt_event_valid(event)) + return -EINVAL; + + if (x86_add_exclusive(x86_lbr_exclusive_pt)) + return -EBUSY; + + event->destroy = pt_event_destroy; + + return 0; +} + +static __init int pt_init(void) +{ + int ret, cpu, prior_warn = 0; + + BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + get_online_cpus(); + for_each_online_cpu(cpu) { + u64 ctl; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); + if (!ret && (ctl & RTIT_CTL_TRACEEN)) + prior_warn++; + } + put_online_cpus(); + + if (prior_warn) { + x86_add_exclusive(x86_lbr_exclusive_pt); + pr_warn("PT is enabled at boot time, doing nothing\n"); + + return -EBUSY; + } + + ret = pt_pmu_hw_init(); + if (ret) + return ret; + + if (!pt_cap_get(PT_CAP_topa_output)) { + pr_warn("ToPA output is not supported on this CPU\n"); + return -ENODEV; + } + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + pt_pmu.pmu.capabilities = + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); + + return ret; +} + +module_init(pt_init); -- cgit v1.2.3 From 8062382c8dbe2dc11d37e7f0b139508cf10de9d4 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 30 Jan 2015 12:40:35 +0200 Subject: perf/x86/intel/bts: Add BTS PMU driver Add support for Branch Trace Store (BTS) via kernel perf event infrastructure. The difference with the existing implementation of BTS support is that this one is a separate PMU that exports events' trace buffers to userspace by means of AUX area of the perf buffer, which is zero-copy mapped into userspace. The immediate benefit is that the buffer size can be much bigger, resulting in fewer interrupts and no kernel side copying is involved and little to no trace data loss. Also, kernel code can be traced with this driver. The old way of collecting BTS traces still works. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1422614435-114702-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event.h | 7 + arch/x86/kernel/cpu/perf_event_intel.c | 6 +- arch/x86/kernel/cpu/perf_event_intel_bts.c | 525 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 3 +- 5 files changed, 540 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_bts.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e6b353f10e09..9bff68798836 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -40,7 +40,7 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f04729ac3290..eaebfd707016 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -410,6 +410,7 @@ union x86_pmu_config { enum { x86_lbr_exclusive_lbr, + x86_lbr_exclusive_bts, x86_lbr_exclusive_pt, x86_lbr_exclusive_max, }; @@ -810,6 +811,12 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); +int intel_bts_interrupt(void); + +void intel_bts_enable_local(void); + +void intel_bts_disable_local(void); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8eb22ce26303..b9861e19cd3d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1242,6 +1242,8 @@ static void intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); + else + intel_bts_disable_local(); intel_pmu_pebs_disable_all(); intel_pmu_lbr_disable_all(); @@ -1264,7 +1266,8 @@ static void intel_pmu_enable_all(int added) return; intel_pmu_enable_bts(event->hw.config); - } + } else + intel_bts_enable_local(); } /* @@ -1550,6 +1553,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); status = intel_pmu_get_status(); if (!status) goto done; diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c new file mode 100644 index 000000000000..fb1a4c28f3e1 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -0,0 +1,525 @@ +/* + * BTS PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "perf_event.h" + +struct bts_ctx { + struct perf_output_handle handle; + struct debug_store ds_back; + int started; +}; + +static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); + +#define BTS_RECORD_SIZE 24 +#define BTS_SAFETY_MARGIN 4080 + +struct bts_phys { + struct page *page; + unsigned long size; + unsigned long offset; + unsigned long displacement; +}; + +struct bts_buffer { + size_t real_size; /* multiple of BTS_RECORD_SIZE */ + unsigned int nr_pages; + unsigned int nr_bufs; + unsigned int cur_buf; + bool snapshot; + local_t data_size; + local_t lost; + local_t head; + unsigned long end; + void **data_pages; + struct bts_phys buf[0]; +}; + +struct pmu bts_pmu; + +void intel_pmu_enable_bts(u64 config); +void intel_pmu_disable_bts(void); + +static size_t buf_size(struct page *page) +{ + return 1 << (PAGE_SHIFT + page_private(page)); +} + +static void * +bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) +{ + struct bts_buffer *buf; + struct page *page; + int node = (cpu == -1) ? cpu : cpu_to_node(cpu); + unsigned long offset; + size_t size = nr_pages << PAGE_SHIFT; + int pg, nbuf, pad; + + /* count all the high order buffers */ + for (pg = 0, nbuf = 0; pg < nr_pages;) { + page = virt_to_page(pages[pg]); + if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) + return NULL; + pg += 1 << page_private(page); + nbuf++; + } + + /* + * to avoid interrupts in overwrite mode, only allow one physical + */ + if (overwrite && nbuf > 1) + return NULL; + + buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->nr_pages = nr_pages; + buf->nr_bufs = nbuf; + buf->snapshot = overwrite; + buf->data_pages = pages; + buf->real_size = size - size % BTS_RECORD_SIZE; + + for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + unsigned int __nr_pages; + + page = virt_to_page(pages[pg]); + __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + buf->buf[nbuf].page = page; + buf->buf[nbuf].offset = offset; + buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); + buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; + buf->buf[nbuf].size -= pad; + + pg += __nr_pages; + offset += __nr_pages << PAGE_SHIFT; + } + + return buf; +} + +static void bts_buffer_free_aux(void *data) +{ + kfree(data); +} + +static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +{ + return buf->buf[idx].offset + buf->buf[idx].displacement; +} + +static void +bts_config_buffer(struct bts_buffer *buf) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_phys *phys = &buf->buf[buf->cur_buf]; + unsigned long index, thresh = 0, end = phys->size; + struct page *page = phys->page; + + index = local_read(&buf->head); + + if (!buf->snapshot) { + if (buf->end < phys->offset + buf_size(page)) + end = buf->end - phys->offset - phys->displacement; + + index -= phys->offset + phys->displacement; + + if (end - index > BTS_SAFETY_MARGIN) + thresh = end - BTS_SAFETY_MARGIN; + else if (end - index > BTS_RECORD_SIZE) + thresh = end - BTS_RECORD_SIZE; + else + thresh = end; + } + + ds->bts_buffer_base = (u64)page_address(page) + phys->displacement; + ds->bts_index = ds->bts_buffer_base + index; + ds->bts_absolute_maximum = ds->bts_buffer_base + end; + ds->bts_interrupt_threshold = !buf->snapshot + ? ds->bts_buffer_base + thresh + : ds->bts_absolute_maximum + BTS_RECORD_SIZE; +} + +static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) +{ + unsigned long index = head - phys->offset; + + memset(page_address(phys->page) + index, 0, phys->size - index); +} + +static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= bts->handle.size || + bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) + return true; + + return false; +} + +static void bts_update(struct bts_ctx *bts) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_buffer *buf = perf_get_aux(&bts->handle); + unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; + + if (!buf) + return; + + head = index + bts_buffer_offset(buf, buf->cur_buf); + old = local_xchg(&buf->head, head); + + if (!buf->snapshot) { + if (old == head) + return; + + if (ds->bts_index >= ds->bts_absolute_maximum) + local_inc(&buf->lost); + + /* + * old and head are always in the same physical buffer, so we + * can subtract them to get the data size. + */ + local_add(head - old, &buf->data_size); + } else { + local_set(&buf->data_size, head); + } +} + +static void __bts_event_start(struct perf_event *event) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + u64 config = 0; + + if (!buf || bts_buffer_is_full(buf, bts)) + return; + + event->hw.state = 0; + + if (!buf->snapshot) + config |= ARCH_PERFMON_EVENTSEL_INT; + if (!event->attr.exclude_kernel) + config |= ARCH_PERFMON_EVENTSEL_OS; + if (!event->attr.exclude_user) + config |= ARCH_PERFMON_EVENTSEL_USR; + + bts_config_buffer(buf); + + /* + * local barrier to make sure that ds configuration made it + * before we enable BTS + */ + wmb(); + + intel_pmu_enable_bts(config); +} + +static void bts_event_start(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + __bts_event_start(event); + + /* PMI handler: this counter is running and likely generating PMIs */ + ACCESS_ONCE(bts->started) = 1; +} + +static void __bts_event_stop(struct perf_event *event) +{ + /* + * No extra synchronization is mandated by the documentation to have + * BTS data stores globally visible. + */ + intel_pmu_disable_bts(); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; +} + +static void bts_event_stop(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* PMI handler: don't restart this counter */ + ACCESS_ONCE(bts->started) = 0; + + __bts_event_stop(event); + + if (flags & PERF_EF_UPDATE) + bts_update(bts); +} + +void intel_bts_enable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event && bts->started) + __bts_event_start(bts->handle.event); +} + +void intel_bts_disable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event) + __bts_event_stop(bts->handle.event); +} + +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +{ + unsigned long head, space, next_space, pad, gap, skip, wakeup; + unsigned int next_buf; + struct bts_phys *phys, *next_phys; + int ret; + + if (buf->snapshot) + return 0; + + head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + if (WARN_ON_ONCE(head != local_read(&buf->head))) + return -EINVAL; + + phys = &buf->buf[buf->cur_buf]; + space = phys->offset + phys->displacement + phys->size - head; + pad = space; + if (space > handle->size) { + space = handle->size; + space -= space % BTS_RECORD_SIZE; + } + if (space <= BTS_SAFETY_MARGIN) { + /* See if next phys buffer has more space */ + next_buf = buf->cur_buf + 1; + if (next_buf >= buf->nr_bufs) + next_buf = 0; + next_phys = &buf->buf[next_buf]; + gap = buf_size(phys->page) - phys->displacement - phys->size + + next_phys->displacement; + skip = pad + gap; + if (handle->size >= skip) { + next_space = next_phys->size; + if (next_space + skip > handle->size) { + next_space = handle->size - skip; + next_space -= next_space % BTS_RECORD_SIZE; + } + if (next_space > space || !space) { + if (pad) + bts_buffer_pad_out(phys, head); + ret = perf_aux_output_skip(handle, skip); + if (ret) + return ret; + /* Advance to next phys buffer */ + phys = next_phys; + space = next_space; + head = phys->offset + phys->displacement; + /* + * After this, cur_buf and head won't match ds + * anymore, so we must not be racing with + * bts_update(). + */ + buf->cur_buf = next_buf; + local_set(&buf->head, head); + } + } + } + + /* Don't go far beyond wakeup watermark */ + wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - + handle->head; + if (space > wakeup) { + space = wakeup; + space -= space % BTS_RECORD_SIZE; + } + + buf->end = head + space; + + /* + * If we have no space, the lost notification would have been sent when + * we hit absolute_maximum - see bts_update() + */ + if (!space) + return -ENOSPC; + + return 0; +} + +int intel_bts_interrupt(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct perf_event *event = bts->handle.event; + struct bts_buffer *buf; + s64 old_head; + int err; + + if (!event || !bts->started) + return 0; + + buf = perf_get_aux(&bts->handle); + /* + * Skip snapshot counters: they don't use the interrupt, but + * there's no other way of telling, because the pointer will + * keep moving + */ + if (!buf || buf->snapshot) + return 0; + + old_head = local_read(&buf->head); + bts_update(bts); + + /* no new data */ + if (old_head == local_read(&buf->head)) + return 0; + + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return 1; + + err = bts_buffer_reset(buf, &bts->handle); + if (err) + perf_aux_output_end(&bts->handle, 0, false); + + return 1; +} + +static void bts_event_del(struct perf_event *event, int mode) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + + bts_event_stop(event, PERF_EF_UPDATE); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; +} + +static int bts_event_add(struct perf_event *event, int mode) +{ + struct bts_buffer *buf; + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + event->hw.state = PERF_HES_STOPPED; + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + return -EBUSY; + + if (bts->handle.event) + return -EBUSY; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return -EINVAL; + + ret = bts_buffer_reset(buf, &bts->handle); + if (ret) { + perf_aux_output_end(&bts->handle, 0, false); + return ret; + } + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + if (mode & PERF_EF_START) { + bts_event_start(event, 0); + if (hwc->state & PERF_HES_STOPPED) { + bts_event_del(event, 0); + return -EBUSY; + } + } + + return 0; +} + +static void bts_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_bts); +} + +static int bts_event_init(struct perf_event *event) +{ + if (event->attr.type != bts_pmu.type) + return -ENOENT; + + if (x86_add_exclusive(x86_lbr_exclusive_bts)) + return -EBUSY; + + event->destroy = bts_event_destroy; + + return 0; +} + +static void bts_event_read(struct perf_event *event) +{ +} + +static __init int bts_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + return -ENODEV; + + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.task_ctx_nr = perf_sw_context; + bts_pmu.event_init = bts_event_init; + bts_pmu.add = bts_event_add; + bts_pmu.del = bts_event_del; + bts_pmu.start = bts_event_start; + bts_pmu.stop = bts_event_stop; + bts_pmu.read = bts_event_read; + bts_pmu.setup_aux = bts_buffer_setup_aux; + bts_pmu.free_aux = bts_buffer_free_aux; + + return perf_pmu_register(&bts_pmu, "intel_bts", -1); +} + +module_init(bts_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 073983398364..a5149c7abe73 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config) debugctlmsr |= DEBUGCTLMSR_TR; debugctlmsr |= DEBUGCTLMSR_BTS; - debugctlmsr |= DEBUGCTLMSR_BTINT; + if (config & ARCH_PERFMON_EVENTSEL_INT) + debugctlmsr |= DEBUGCTLMSR_BTINT; if (!(config & ARCH_PERFMON_EVENTSEL_OS)) debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; -- cgit v1.2.3 From 9a5e3fb52ae5458c8bf1a67129b96c39b541a582 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:06:53 +0100 Subject: perf/x86: Rename x86_pmu::er_flags to 'flags' Because it will be used for more than just tracking the presence of extra registers. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1416251225-17721-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 9 ++++++--- arch/x86/kernel/cpu/perf_event_intel.c | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index eaebfd707016..5264010c9a08 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -521,7 +521,7 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; - unsigned int er_flags; + unsigned int flags; /* * Intel host/guest support (KVM) @@ -545,8 +545,11 @@ do { \ x86_pmu.quirks = &__quirk; \ } while (0) -#define ERF_NO_HT_SHARING 1 -#define ERF_HAS_RSP_1 2 +/* + * x86_pmu flags + */ +#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ +#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1c78f44f4f93..e85988e2ecc7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1667,7 +1667,7 @@ intel_bts_constraints(struct perf_event *event) static int intel_alt_er(int idx) { - if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) @@ -2250,7 +2250,7 @@ static void intel_pmu_cpu_starting(int cpu) if (!cpuc->shared_regs) return; - if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_shared_regs *pc; @@ -2671,7 +2671,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; x86_pmu.extra_regs = intel_slm_extra_regs; - x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; pr_cont("Silvermont events, "); break; @@ -2689,7 +2689,7 @@ __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; - x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = nhm_events_attrs; @@ -2721,8 +2721,8 @@ __init int intel_pmu_init(void) else x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; @@ -2756,8 +2756,8 @@ __init int intel_pmu_init(void) else x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; @@ -2784,8 +2784,8 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; @@ -2817,8 +2817,8 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; -- cgit v1.2.3 From 90413464313e00fe4975f4a0ebf25fe31d01f793 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:06:54 +0100 Subject: perf/x86: Vectorize cpuc->kfree_on_online Make the cpuc->kfree_on_online a vector to accommodate more than one entry and add the second entry to be used by a later patch. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maria Dimakopoulou Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- arch/x86/kernel/cpu/perf_event.h | 8 +++++++- arch/x86/kernel/cpu/perf_event_amd.c | 3 ++- arch/x86/kernel/cpu/perf_event_intel.c | 4 +++- 4 files changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 549d01d6d996..682ef00727e7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1373,11 +1373,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int ret = NOTIFY_OK; + int i, ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - cpuc->kfree_on_online = NULL; + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; @@ -1388,7 +1389,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_ONLINE: - kfree(cpuc->kfree_on_online); + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; + } break; case CPU_DYING: diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 5264010c9a08..55b915511e53 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -125,6 +125,12 @@ struct intel_shared_regs { #define MAX_LBR_ENTRIES 16 +enum { + X86_PERF_KFREE_SHARED = 0, + X86_PERF_KFREE_EXCL = 1, + X86_PERF_KFREE_MAX +}; + struct cpu_hw_events { /* * Generic x86 PMC bits @@ -187,7 +193,7 @@ struct cpu_hw_events { /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; - void *kfree_on_online; + void *kfree_on_online[X86_PERF_KFREE_MAX]; }; #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 28926311aac1..e4302b8fed2a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu) static void amd_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; struct amd_nb *nb; int i, nb_id; @@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu) continue; if (nb->nb_id == nb_id) { - cpuc->kfree_on_online = cpuc->amd_nb; + *onln = cpuc->amd_nb; cpuc->amd_nb = nb; break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e85988e2ecc7..c0ed5a4b9537 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2251,12 +2251,14 @@ static void intel_pmu_cpu_starting(int cpu) return; if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; + for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_shared_regs *pc; pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online = cpuc->shared_regs; + *onln = cpuc->shared_regs; cpuc->shared_regs = pc; break; } -- cgit v1.2.3 From c5362c0c376486afcf3c91d3c2691d348ac1e2fd Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:55 +0100 Subject: perf/x86: Add 3 new scheduling callbacks This patch adds 3 new PMU model specific callbacks during the event scheduling done by x86_schedule_events(). ->start_scheduling(): invoked when entering the schedule routine. ->stop_scheduling(): invoked at the end of the schedule routine ->commit_scheduling(): invoked for each committed event To be used optionally by model-specific code. Signed-off-by: Maria Dimakopoulou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++++ arch/x86/kernel/cpu/perf_event.h | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 682ef00727e7..cd6115867fb8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -784,6 +784,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); + if (x86_pmu.start_scheduling) + x86_pmu.start_scheduling(cpuc); + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); @@ -830,6 +833,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (i = 0; i < n; i++) { e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; + if (x86_pmu.commit_scheduling) + x86_pmu.commit_scheduling(cpuc, e, assign[i]); } } /* @@ -850,6 +855,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.put_event_constraints(cpuc, e); } } + + if (x86_pmu.stop_scheduling) + x86_pmu.stop_scheduling(cpuc); + return num ? -EINVAL : 0; } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 55b915511e53..ea27e63fc945 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -460,6 +460,15 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + + void (*commit_scheduling)(struct cpu_hw_events *cpuc, + struct perf_event *event, + int cntr); + + void (*start_scheduling)(struct cpu_hw_events *cpuc); + + void (*stop_scheduling)(struct cpu_hw_events *cpuc); + struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; -- cgit v1.2.3 From 79cba822443a168c8f7f5b853d9c7225a6d5415e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:06:56 +0100 Subject: perf/x86: Add 'index' param to get_event_constraint() callback This patch adds an index parameter to the get_event_constraint() x86_pmu callback. It is expected to represent the index of the event in the cpuc->event_list[] array. When the callback is used for fake_cpuc (evnet validation), then the index must be -1. The motivation for passing the index is to use it to index into another cpuc array. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1416251225-17721-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- arch/x86/kernel/cpu/perf_event.h | 4 +++- arch/x86/kernel/cpu/perf_event_amd.c | 6 ++++-- arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++----- 4 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cd6115867fb8..71755401476c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -789,7 +789,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); hwc->constraint = c; wmin = min(wmin, c->weight); @@ -1777,7 +1777,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, event); + c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); if (!c || !c->weight) ret = -EINVAL; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ea27e63fc945..24a65057c1c0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -456,6 +456,7 @@ struct x86_pmu { u64 max_period; struct event_constraint * (*get_event_constraints)(struct cpu_hw_events *cpuc, + int idx, struct perf_event *event); void (*put_event_constraints)(struct cpu_hw_events *cpuc, @@ -751,7 +752,8 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event); struct intel_shared_regs *allocate_shared_regs(int cpu); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index e4302b8fed2a..1cee5d2d7ece 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -430,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu) } static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { /* * if not NB event or no NB, then no constraints @@ -538,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); static struct event_constraint * -amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; unsigned int event_code = amd_get_event_code(hwc); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c0ed5a4b9537..2dd34b57d3ff 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1827,7 +1827,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, } struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct event_constraint *c; @@ -1844,7 +1845,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) } static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct event_constraint *c; @@ -1860,7 +1862,7 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - return x86_get_event_constraints(cpuc, event); + return x86_get_event_constraints(cpuc, idx, event); } static void @@ -2105,9 +2107,12 @@ static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); static struct event_constraint * -hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { - struct event_constraint *c = intel_get_event_constraints(cpuc, event); + struct event_constraint *c; + + c = intel_get_event_constraints(cpuc, idx, event); /* Handle special quirk on in_tx_checkpointed only in counter 2 */ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { -- cgit v1.2.3 From 6f6539cad926f55d5eb6e79d05bbe99f0d54d56d Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:57 +0100 Subject: perf/x86/intel: Add cross-HT counter exclusion infrastructure This patch adds a new shared_regs style structure to the per-cpu x86 state (cpuc). It is used to coordinate access between counters which must be used with exclusion across HyperThreads on Intel processors. This new struct is not needed on each PMU, thus is is allocated on demand. Signed-off-by: Maria Dimakopoulou [peterz: spinlock_t -> raw_spinlock_t] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-6-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 32 +++++++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 71 +++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 24a65057c1c0..f31f90e2d859 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,7 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ @@ -123,6 +124,26 @@ struct intel_shared_regs { unsigned core_id; /* per-core: core id */ }; +enum intel_excl_state_type { + INTEL_EXCL_UNUSED = 0, /* counter is unused */ + INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ + INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ +}; + +struct intel_excl_states { + enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; + enum intel_excl_state_type state[X86_PMC_IDX_MAX]; +}; + +struct intel_excl_cntrs { + raw_spinlock_t lock; + + struct intel_excl_states states[2]; + + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + #define MAX_LBR_ENTRIES 16 enum { @@ -185,6 +206,12 @@ struct cpu_hw_events { * used on Intel NHM/WSM/SNB */ struct intel_shared_regs *shared_regs; + /* + * manage exclusive counter access between hyperthread + */ + struct event_constraint *constraint_list; /* in enable order */ + struct intel_excl_cntrs *excl_cntrs; + int excl_thread_id; /* 0 or 1 */ /* * AMD specific bits @@ -208,6 +235,10 @@ struct cpu_hw_events { #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ + 0, PERF_X86_EVENT_EXCL) + /* * The overlap flag marks event constraints with overlapping counter * masks. This is the case if the counter mask of such an event is not @@ -566,6 +597,7 @@ do { \ */ #define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ +#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2dd34b57d3ff..7f54000fd0f1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2224,16 +2224,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) return regs; } +static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) +{ + struct intel_excl_cntrs *c; + int i; + + c = kzalloc_node(sizeof(struct intel_excl_cntrs), + GFP_KERNEL, cpu_to_node(cpu)); + if (c) { + raw_spin_lock_init(&c->lock); + for (i = 0; i < X86_PMC_IDX_MAX; i++) { + c->states[0].state[i] = INTEL_EXCL_UNUSED; + c->states[0].init_state[i] = INTEL_EXCL_UNUSED; + + c->states[1].state[i] = INTEL_EXCL_UNUSED; + c->states[1].init_state[i] = INTEL_EXCL_UNUSED; + } + c->core_id = -1; + } + return c; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) - return NOTIFY_OK; + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + return NOTIFY_BAD; + } - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - return NOTIFY_BAD; + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); + + cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); + if (!cpuc->constraint_list) + return NOTIFY_BAD; + + cpuc->excl_cntrs = allocate_excl_cntrs(cpu); + if (!cpuc->excl_cntrs) { + kfree(cpuc->constraint_list); + kfree(cpuc->shared_regs); + return NOTIFY_BAD; + } + cpuc->excl_thread_id = 0; + } return NOTIFY_OK; } @@ -2274,12 +2310,29 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.lbr_sel_map) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_excl_cntrs *c; + + c = per_cpu(cpu_hw_events, i).excl_cntrs; + if (c && c->core_id == core_id) { + cpuc->kfree_on_online[1] = cpuc->excl_cntrs; + cpuc->excl_cntrs = c; + cpuc->excl_thread_id = 1; + break; + } + } + cpuc->excl_cntrs->core_id = core_id; + cpuc->excl_cntrs->refcnt++; + } } static void intel_pmu_cpu_dying(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); struct intel_shared_regs *pc; + struct intel_excl_cntrs *c; pc = cpuc->shared_regs; if (pc) { @@ -2287,6 +2340,14 @@ static void intel_pmu_cpu_dying(int cpu) kfree(pc); cpuc->shared_regs = NULL; } + c = cpuc->excl_cntrs; + if (c) { + if (c->core_id == -1 || --c->refcnt == 0) + kfree(c); + cpuc->excl_cntrs = NULL; + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + } fini_debug_store_on_cpu(cpu); } -- cgit v1.2.3 From e979121b1b1556e184492e6fc149bbe188fc83e6 Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:58 +0100 Subject: perf/x86/intel: Implement cross-HT corruption bug workaround This patch implements a software workaround for a HW erratum on Intel SandyBridge, IvyBridge and Haswell processors with Hyperthreading enabled. The errata are documented for each processor in their respective specification update documents: - SandyBridge: BJ122 - IvyBridge: BV98 - Haswell: HSD29 The bug causes silent counter corruption across hyperthreads only when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3). Counters measuring those events may leak counts to the sibling counter. For instance, counter 0, thread 0 measuring event 0xd0, may leak to counter 0, thread 1, regardless of the event measured there. The size of the leak is not predictible. It all depends on the workload and the state of each sibling hyper-thread. The corrupting events do undercount as a consequence of the leak. The leak is compensated automatically only when the sibling counter measures the exact same corrupting event AND the workload is on the two threads is the same. Given, there is no way to guarantee this, a work-around is necessary. Furthermore, there is a serious problem if the leaked count is added to a low-occurrence event. In that case the corruption on the low occurrence event can be very large, e.g., orders of magnitude. There is no HW or FW workaround for this problem. The bug is very easy to reproduce on a loaded system. Here is an example on a Haswell client, where CPU0, CPU4 are siblings. We load the CPUs with a simple triad app streaming large floating-point vector. We use 0x81d0 corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and 0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not using the LBR, the 0x20cc event should be zero. $ taskset -c 0 triad & $ taskset -c 4 triad & $ perf stat -a -C 0 -e r81d0 sleep 100 & $ perf stat -a -C 4 -r20cc sleep 10 Performance counter stats for 'system wide': 139 277 291 r20cc 10,000969126 seconds time elapsed In this example, 0x81d0 and r20cc ar eusing sinling counters on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it from 0 to 139 millions occurrences. This patch provides a software workaround to this problem by modifying the way events are scheduled onto counters by the kernel. The patch forces cross-thread mutual exclusion between counters in case a corrupting event is measured by one of the hyper-threads. If thread 0, counter 0 is measuring event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting event is measured on any hyper-thread, event scheduling proceeds as before. The same example run with the workaround enabled, yield the correct answer: $ taskset -c 0 triad & $ taskset -c 4 triad & $ perf stat -a -C 0 -e r81d0 sleep 100 & $ perf stat -a -C 4 -r20cc sleep 10 Performance counter stats for 'system wide': 0 r20cc 10,000969126 seconds time elapsed The patch does provide correctness for all non-corrupting events. It does not "repatriate" the leaked counts back to the leaking counter. This is planned for a second patch series. This patch series makes this repatriation more easy by guaranteeing the sibling counter is not measuring any useful event. The patch introduces dynamic constraints for events. That means that events which did not have constraints, i.e., could be measured on any counters, may now be constrained to a subset of the counters depending on what is going on the sibling thread. The algorithm is similar to a cache coherency protocol. We call it XSU in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU counter. As a consequence of the workaround, users may see an increased amount of event multiplexing, even in situtations where there are fewer events than counters measured on a CPU. Patch has been tested on all three impacted processors. Note that when HT is off, there is no corruption. However, the workaround is still enabled, yet not costing too much. Adding a dynamic detection of HT on turned out to be complex are requiring too much to code to be justified. This patch addresses the issue when PEBS is not used. A subsequent patch fixes the problem when PEBS is used. Signed-off-by: Maria Dimakopoulou [spinlock_t -> raw_spinlock_t] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 31 ++-- arch/x86/kernel/cpu/perf_event.h | 6 + arch/x86/kernel/cpu/perf_event_intel.c | 307 ++++++++++++++++++++++++++++++++- 3 files changed, 331 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 71755401476c..b8b7a1277d8d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -779,7 +779,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, num = 0; + int i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); @@ -822,14 +822,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) - num = perf_assign_events(cpuc->event_list, n, wmin, - wmax, assign); + unsched = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* - * Mark the event as committed, so we do not put_constraint() - * in case new events are added and fail scheduling. + * In case of success (unsched = 0), mark events as committed, + * so we do not put_constraint() in case new events are added + * and fail to be scheduled + * + * We invoke the lower level commit callback to lock the resource + * + * We do not need to do all of this in case we are called to + * validate an event group (assign == NULL) */ - if (!num && assign) { + if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; @@ -837,11 +843,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.commit_scheduling(cpuc, e, assign[i]); } } - /* - * scheduling failed or is just a simulation, - * free resources if necessary - */ - if (!assign || num) { + + if (!assign || unsched) { + for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* @@ -851,6 +855,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) continue; + /* + * release events that failed scheduling + */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); } @@ -859,7 +866,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.stop_scheduling) x86_pmu.stop_scheduling(cpuc); - return num ? -EINVAL : 0; + return unsched ? -EINVAL : 0; } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f31f90e2d859..236afee35587 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -72,6 +72,7 @@ struct event_constraint { #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ #define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x80 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ @@ -133,6 +134,7 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; + bool sched_started; /* true if scheduling has started */ }; struct intel_excl_cntrs { @@ -296,6 +298,10 @@ struct cpu_hw_events { #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) +#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7f54000fd0f1..91cc7749d7ce 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1845,7 +1845,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, +__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { struct event_constraint *c; @@ -1865,6 +1865,254 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return x86_get_event_constraints(cpuc, idx, event); } +static void +intel_start_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* sibling thread */ + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + /* + * no exclusion needed + */ + if (!excl_cntrs) + return; + + xlo = &excl_cntrs->states[o_tid]; + xl = &excl_cntrs->states[tid]; + + xl->sched_started = true; + + /* + * lock shared state until we are done scheduling + * in stop_event_scheduling() + * makes scheduling appear as a transaction + */ + WARN_ON_ONCE(!irqs_disabled()); + raw_spin_lock(&excl_cntrs->lock); + + /* + * save initial state of sibling thread + */ + memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state)); +} + +static void +intel_stop_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* sibling thread */ + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + /* + * no exclusion needed + */ + if (!excl_cntrs) + return; + + xlo = &excl_cntrs->states[o_tid]; + xl = &excl_cntrs->states[tid]; + + /* + * make new sibling thread state visible + */ + memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); + + xl->sched_started = false; + /* + * release shared state lock (acquired in intel_start_scheduling()) + */ + raw_spin_unlock(&excl_cntrs->lock); +} + +static struct event_constraint * +intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + int idx, struct event_constraint *c) +{ + struct event_constraint *cx; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int is_excl, i; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* alternate */ + + /* + * validating a group does not require + * enforcing cross-thread exclusion + */ + if (cpuc->is_fake) + return c; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + + /* + * xl = state of current HT + * xlo = state of sibling HT + */ + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + cx = c; + + /* + * because we modify the constraint, we need + * to make a copy. Static constraints come + * from static const tables. + * + * only needed when constraint has not yet + * been cloned (marked dynamic) + */ + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + + /* sanity check */ + if (idx < 0) + return &emptyconstraint; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + memcpy(cx, c, sizeof(*cx)); + + /* + * mark constraint as dynamic, so we + * can free it later on + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + } + + /* + * From here on, the constraint is dynamic. + * Either it was just allocated above, or it + * was allocated during a earlier invocation + * of this function + */ + + /* + * Modify static constraint with current dynamic + * state of thread + * + * EXCLUSIVE: sibling counter measuring exclusive event + * SHARED : sibling counter measuring non-exclusive event + * UNUSED : sibling counter unused + */ + for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { + /* + * exclusive event in sibling counter + * our corresponding counter cannot be used + * regardless of our event + */ + if (xl->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, cx->idxmsk); + /* + * if measuring an exclusive event, sibling + * measuring non-exclusive, then counter cannot + * be used + */ + if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, cx->idxmsk); + } + + /* + * recompute actual bit weight for scheduling algorithm + */ + cx->weight = hweight64(cx->idxmsk64); + + /* + * if we return an empty mask, then switch + * back to static empty constraint to avoid + * the cost of freeing later on + */ + if (cx->weight == 0) + cx = &emptyconstraint; + + return cx; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c = event->hw.constraint; + + /* + * first time only + * - static constraint: no change across incremental scheduling calls + * - dynamic constraint: handled by intel_get_excl_constraints() + */ + if (!c) + c = __intel_get_event_constraints(cpuc, idx, event); + + if (cpuc->excl_cntrs) + return intel_get_excl_constraints(cpuc, event, idx, c); + + return c; +} + +static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xlo, *xl; + unsigned long flags = 0; /* keep compiler happy */ + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + + WARN_ON_ONCE(!excl_cntrs); + + if (!excl_cntrs) + return; + + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional + */ + if (!xl->sched_started) + raw_spin_lock_irqsave(&excl_cntrs->lock, flags); + + /* + * if event was actually assigned, then mark the + * counter state as unused now + */ + if (hwc->idx >= 0) + xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; + + if (!xl->sched_started) + raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); +} + static void intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) @@ -1883,7 +2131,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { + struct event_constraint *c = event->hw.constraint; + intel_put_shared_regs_event_constraints(cpuc, event); + + /* + * is PMU has exclusive counter restrictions, then + * all events are subject to and must call the + * put_excl_constraints() routine + */ + if (c && cpuc->excl_cntrs) + intel_put_excl_constraints(cpuc, event); + + /* cleanup dynamic constraint */ + if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) + event->hw.constraint = NULL; +} + +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, + struct perf_event *event, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = event->hw.constraint; + struct intel_excl_states *xlo, *xl; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; + int is_excl; + + if (cpuc->is_fake || !c) + return; + + is_excl = c->flags & PERF_X86_EVENT_EXCL; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + WARN_ON_ONCE(!excl_cntrs); + + if (!excl_cntrs) + return; + + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); + + if (cntr >= 0) { + if (is_excl) + xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xlo->init_state[cntr] = INTEL_EXCL_SHARED; + } } static void intel_pebs_aliases_core2(struct perf_event *event) @@ -2349,6 +2647,13 @@ static void intel_pmu_cpu_dying(int cpu) cpuc->constraint_list = NULL; } + c = cpuc->excl_cntrs; + if (c) { + if (c->core_id == -1 || --c->refcnt == 0) + kfree(c); + cpuc->excl_cntrs = NULL; + } + fini_debug_store_on_cpu(cpu); } -- cgit v1.2.3 From 93fcf72cc0fa286aa8c3e11a1a8fd4659f0e27c0 Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:59 +0100 Subject: perf/x86/intel: Enforce HT bug workaround for SNB/IVB/HSW This patches activates the HT bug workaround for the SNB/IVB/HSW processors. This covers non-PEBS mode. Activation is done thru the constraint tables. Both client and server processors needs this workaround. Signed-off-by: Maria Dimakopoulou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 53 ++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 91cc7749d7ce..9350de0f5407 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -113,6 +113,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -131,15 +137,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - /* - * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT - * siblings; disable these events because they can corrupt unrelated - * counters. - */ - INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -217,6 +220,12 @@ static struct event_constraint intel_hsw_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -2865,6 +2874,27 @@ static __init void intel_nehalem_quirk(void) } } +/* + * enable software workaround for errata: + * SNB: BJ122 + * IVB: BV98 + * HSW: HSD29 + * + * Only needed when HT is enabled. However detecting + * this is too difficult and model specific so we enable + * it even with HT off for now. + */ +static __init void intel_ht_bug(void) +{ + x86_pmu.flags |= PMU_FL_EXCL_CNTRS; + + x86_pmu.commit_scheduling = intel_commit_scheduling; + x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.stop_scheduling = intel_stop_scheduling; + + pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n"); +} + EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") @@ -3079,6 +3109,7 @@ __init int intel_pmu_init(void) case 42: /* 32nm SandyBridge */ case 45: /* 32nm SandyBridge-E/EN/EP */ x86_add_quirk(intel_sandybridge_quirk); + x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -3093,6 +3124,8 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_snbep_extra_regs; else x86_pmu.extra_regs = intel_snb_extra_regs; + + /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; @@ -3111,6 +3144,7 @@ __init int intel_pmu_init(void) case 58: /* 22nm IvyBridge */ case 62: /* 22nm IvyBridge-EP/EX */ + x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); /* dTLB-load-misses on IVB is different than SNB */ @@ -3146,6 +3180,7 @@ __init int intel_pmu_init(void) case 63: /* 22nm Haswell Server */ case 69: /* 22nm Haswell ULT */ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + x86_add_quirk(intel_ht_bug); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); -- cgit v1.2.3 From b63b4b459a78a9a45ea47a4803b8d1868e9d17d5 Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:07:00 +0100 Subject: perf/x86/intel: Enforce HT bug workaround with PEBS for SNB/IVB/HSW This patch modifies the PEBS constraint tables for SNB/IVB/HSW such that corrupting events supporting PEBS activate the HT workaround. Signed-off-by: Maria Dimakopoulou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-9-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 20 +++++++++++++++++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 28 ++++++++++++++++++---------- 2 files changed, 37 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 236afee35587..2ba71ecc244f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -326,22 +326,40 @@ struct cpu_hw_events { /* Check flags and event code, and set the HSW load flag */ #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ + __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW store flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW load flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a5149c7abe73..ca69ea56c712 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -612,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END @@ -623,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END @@ -634,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END -- cgit v1.2.3 From a90738c2cb0dceb882e0d7f7f9141e0062809b4d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:07:01 +0100 Subject: perf/x86/intel: Fix intel_get_event_constraints() for dynamic constraints With dynamic constraint, we need to restart from the static constraints each time the intel_get_event_constraints() is called. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maria Dimakopoulou Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9350de0f5407..4773ae0b52d0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2063,20 +2063,25 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c = event->hw.constraint; + struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c2; /* * first time only * - static constraint: no change across incremental scheduling calls * - dynamic constraint: handled by intel_get_excl_constraints() */ - if (!c) - c = __intel_get_event_constraints(cpuc, idx, event); + c2 = __intel_get_event_constraints(cpuc, idx, event); + if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); + c1->weight = c2->weight; + c2 = c1; + } if (cpuc->excl_cntrs) - return intel_get_excl_constraints(cpuc, event, idx, c); + return intel_get_excl_constraints(cpuc, event, idx, c2); - return c; + return c2; } static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, -- cgit v1.2.3 From c02cdbf60b51b8d98a49185535f5d527a2965142 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:07:02 +0100 Subject: perf/x86/intel: Limit to half counters when the HT workaround is enabled, to avoid exclusive mode starvation This patch limits the number of counters available to each CPU when the HT bug workaround is enabled. This is necessary to avoid situation of counter starvation. Such can arise from configuration where one HT thread, HT0, is using all 4 counters with corrupting events which require exclusion the the sibling HT, HT1. In such case, HT1 would not be able to schedule any event until HT0 is done. To mitigate this problem, this patch artificially limits the number of counters to 2. That way, we can gurantee that at least 2 counters are not in exclusive mode and therefore allow the sibling thread to schedule events of the same type (system vs. per-thread). The 2 counters are not determined in advance. We simply set the limit to two events per HT. This helps mitigate starvation in case of events with specific counter constraints such a PREC_DIST. Note that this does not elimintate the starvation is all cases. But it is better than not having it. (Solution suggested by Peter Zjilstra.) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1416251225-17721-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 ++ arch/x86/kernel/cpu/perf_event_intel.c | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 2ba71ecc244f..176749cca98f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -134,6 +134,8 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; + int num_alloc_cntrs;/* #counters allocated */ + int max_alloc_cntrs;/* max #counters allowed */ bool sched_started; /* true if scheduling has started */ }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 4773ae0b52d0..4187d3f4ed12 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1897,7 +1897,7 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; xl->sched_started = true; - + xl->num_alloc_cntrs = 0; /* * lock shared state until we are done scheduling * in stop_event_scheduling() @@ -1963,7 +1963,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, */ if (cpuc->is_fake) return c; - /* * event requires exclusive counter access * across HT threads @@ -1977,6 +1976,18 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; + /* + * do not allow scheduling of more than max_alloc_cntrs + * which is set to half the available generic counters. + * this helps avoid counter starvation of sibling thread + * by ensuring at most half the counters cannot be in + * exclusive mode. There is not designated counters for the + * limits. Any N/2 counters can be used. This helps with + * events with specifix counter constraints + */ + if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) + return &emptyconstraint; + cx = c; /* @@ -2624,6 +2635,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + int h = x86_pmu.num_counters >> 1; + for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_excl_cntrs *c; @@ -2637,6 +2650,11 @@ static void intel_pmu_cpu_starting(int cpu) } cpuc->excl_cntrs->core_id = core_id; cpuc->excl_cntrs->refcnt++; + /* + * set hard limit to half the number of generic counters + */ + cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; + cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; } } -- cgit v1.2.3 From b37609c30e41264c4df4acff78abfc894499a49b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:07:04 +0100 Subject: perf/x86/intel: Make the HT bug workaround conditional on HT enabled This patch disables the PMU HT bug when Hyperthreading (HT) is disabled. We cannot do this test immediately when perf_events is initialized. We need to wait until the topology information is setup properly. As such, we register a later initcall, check the topology and potentially disable the workaround. To do this, we need to ensure there is no user of the PMU. At this point of the boot, the only user is the NMI watchdog, thus we disable it during the switch and re-enable it right after. Having the workaround disabled when it is not needed provides some benefits by limiting the overhead is time and space. The workaround still ensures correct scheduling of the corrupting memory events (0xd0, 0xd1, 0xd2) when HT is off. Those events can only be measured on counters 0-3. Something else the current kernel did not handle correctly. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1416251225-17721-13-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 5 ++ arch/x86/kernel/cpu/perf_event_intel.c | 95 ++++++++++++++++++++++++++-------- 2 files changed, 79 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 176749cca98f..7250c0281f9d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -624,6 +624,7 @@ do { \ #define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ +#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -904,6 +905,10 @@ int knc_pmu_init(void); ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline int is_ht_workaround_enabled(void) +{ + return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); +} #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 4187d3f4ed12..6ea61a572fb0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -1885,8 +1886,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) /* * nothing needed if in group validation mode */ - if (cpuc->is_fake) + if (cpuc->is_fake || !is_ht_workaround_enabled()) return; + /* * no exclusion needed */ @@ -1923,7 +1925,7 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) /* * nothing needed if in group validation mode */ - if (cpuc->is_fake) + if (cpuc->is_fake || !is_ht_workaround_enabled()) return; /* * no exclusion needed @@ -1961,7 +1963,13 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * validating a group does not require * enforcing cross-thread exclusion */ - if (cpuc->is_fake) + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return c; + + /* + * no exclusion needed + */ + if (!excl_cntrs) return c; /* * event requires exclusive counter access @@ -2658,18 +2666,11 @@ static void intel_pmu_cpu_starting(int cpu) } } -static void intel_pmu_cpu_dying(int cpu) +static void free_excl_cntrs(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_shared_regs *pc; struct intel_excl_cntrs *c; - pc = cpuc->shared_regs; - if (pc) { - if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); - cpuc->shared_regs = NULL; - } c = cpuc->excl_cntrs; if (c) { if (c->core_id == -1 || --c->refcnt == 0) @@ -2678,14 +2679,22 @@ static void intel_pmu_cpu_dying(int cpu) kfree(cpuc->constraint_list); cpuc->constraint_list = NULL; } +} - c = cpuc->excl_cntrs; - if (c) { - if (c->core_id == -1 || --c->refcnt == 0) - kfree(c); - cpuc->excl_cntrs = NULL; +static void intel_pmu_cpu_dying(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; } + free_excl_cntrs(cpu); + fini_debug_store_on_cpu(cpu); } @@ -2904,18 +2913,18 @@ static __init void intel_nehalem_quirk(void) * HSW: HSD29 * * Only needed when HT is enabled. However detecting - * this is too difficult and model specific so we enable - * it even with HT off for now. + * if HT is enabled is difficult (model specific). So instead, + * we enable the workaround in the early boot, and verify if + * it is needed in a later initcall phase once we have valid + * topology information to check if HT is actually enabled */ static __init void intel_ht_bug(void) { - x86_pmu.flags |= PMU_FL_EXCL_CNTRS; + x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.start_scheduling = intel_start_scheduling; x86_pmu.stop_scheduling = intel_stop_scheduling; - - pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n"); } EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); @@ -3343,3 +3352,47 @@ __init int intel_pmu_init(void) return 0; } + +/* + * HT bug: phase 2 init + * Called once we have valid topology information to check + * whether or not HT is enabled + * If HT is off, then we disable the workaround + */ +static __init int fixup_ht_bug(void) +{ + int cpu = smp_processor_id(); + int w, c; + /* + * problem not present on this CPU model, nothing to do + */ + if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) + return 0; + + w = cpumask_weight(topology_thread_cpumask(cpu)); + if (w > 1) { + pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); + return 0; + } + + watchdog_nmi_disable_all(); + + x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); + + x86_pmu.commit_scheduling = NULL; + x86_pmu.start_scheduling = NULL; + x86_pmu.stop_scheduling = NULL; + + watchdog_nmi_enable_all(); + + get_online_cpus(); + + for_each_online_cpu(c) { + free_excl_cntrs(c); + } + + put_online_cpus(); + pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); + return 0; +} +subsys_initcall(fixup_ht_bug) -- cgit v1.2.3 From 8882edf735738c949aba4b65d3ec3453066bab12 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Feb 2015 09:48:30 -0800 Subject: perf/x86/intel: Reset more state in PMU reset The PMU reset code didn't quite keep up with newer PMU features. Improve it a bit to really reset a modern PMU: - Clear all overflow status - Clear LBRs and freezing state - Disable fixed counters too Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1425059312-18217-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6ea61a572fb0..59994602bb94 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1538,6 +1538,18 @@ static void intel_pmu_reset(void) if (ds) ds->bts_index = ds->bts_buffer_base; + /* Ack all overflows and disable fixed counters */ + if (x86_pmu.version >= 2) { + intel_pmu_ack_status(intel_pmu_get_status()); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + /* Reset LBRs and LBR freezing */ + if (x86_pmu.lbr_nr) { + update_debugctlmsr(get_debugctlmsr() & + ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); + } + local_irq_restore(flags); } -- cgit v1.2.3 From da3e606d885a17525eb18afd423f5c438860b833 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Feb 2015 09:48:31 -0800 Subject: perf/x86: Dump DEBUGCTL in PMU dump LBRs and LBR freezing are controlled through the DEBUGCTL MSR. So dump the state of DEBUGCTL too when dumping the PMU state. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1425059312-18217-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b8b7a1277d8d..994737263daa 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1171,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs; + u64 pebs, debugctl; struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; @@ -1197,6 +1197,10 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + if (x86_pmu.lbr_nr) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); + } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); -- cgit v1.2.3 From 15fde1101a1aed11958e0d86bc360f01866a74b1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Feb 2015 09:48:32 -0800 Subject: perf/x86: Only dump PEBS register when PEBS has been detected Technically PEBS_ENABLED is only guaranteed to exist when we detected PEBS. So add a check for this to the PMU dump function. I don't think it can happen on a real CPU, but could in a VM. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1425059312-18217-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 994737263daa..689e35760924 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1189,14 +1189,16 @@ void perf_event_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + if (x86_pmu.pebs_constraints) { + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); -- cgit v1.2.3 From 1a78d93750bb5f61abdc59a91fc3bd06a214542a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 Mar 2015 10:11:23 -0700 Subject: perf/x86/intel: Streamline LBR MSR handling in PMI The perf PMI currently does unnecessary MSR accesses when LBRs are enabled. We use LBR freezing, or when in callstack mode force the LBRs to only filter on ring 3. So there is no need to disable the LBRs explicitely in the PMI handler. Also we always unnecessarily rewrite LBR_SELECT in the LBR handler, even though it can never change. 5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */ 5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */ 5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */ 5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */ 5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */ 5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */ 5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */ 5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */ This patch: - Avoids disabling already frozen LBRs unnecessarily in the PMI - Avoids changing LBR_SELECT in the PMI Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 23 ++++++++++++++++++----- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 12 ++++++++---- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7250c0281f9d..329f0356ad4a 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -870,7 +870,7 @@ void intel_pmu_lbr_enable(struct perf_event *event); void intel_pmu_lbr_disable(struct perf_event *event); -void intel_pmu_lbr_enable_all(void); +void intel_pmu_lbr_enable_all(bool pmi); void intel_pmu_lbr_disable_all(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 59994602bb94..9da2400c2ec3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1244,7 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static void intel_pmu_disable_all(void) +/* + * Use from PMIs where the LBRs are already disabled. + */ +static void __intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1256,15 +1259,20 @@ static void intel_pmu_disable_all(void) intel_bts_disable_local(); intel_pmu_pebs_disable_all(); +} + +static void intel_pmu_disable_all(void) +{ + __intel_pmu_disable_all(); intel_pmu_lbr_disable_all(); } -static void intel_pmu_enable_all(int added) +static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); intel_pmu_pebs_enable_all(); - intel_pmu_lbr_enable_all(); + intel_pmu_lbr_enable_all(pmi); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -1280,6 +1288,11 @@ static void intel_pmu_enable_all(int added) intel_bts_enable_local(); } +static void intel_pmu_enable_all(int added) +{ + __intel_pmu_enable_all(added, false); +} + /* * Workaround for: * Intel Errata AAK100 (model 26) @@ -1573,7 +1586,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); - intel_pmu_disable_all(); + __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); status = intel_pmu_get_status(); @@ -1658,7 +1671,7 @@ again: goto again; done: - intel_pmu_enable_all(0); + __intel_pmu_enable_all(0, true); /* * Only unmask the NMI after the overflow counters * have been reset. This avoids spurious NMIs on diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 0473874109cb..3d537252f011 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -132,12 +132,16 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); * otherwise it becomes near impossible to get a reliable stack. */ -static void __intel_pmu_lbr_enable(void) +static void __intel_pmu_lbr_enable(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); u64 debugctl, lbr_select = 0; - if (cpuc->lbr_sel) { + /* + * No need to reprogram LBR_SELECT in a PMI, as it + * did not change. + */ + if (cpuc->lbr_sel && !pmi) { lbr_select = cpuc->lbr_sel->config; wrmsrl(MSR_LBR_SELECT, lbr_select); } @@ -351,12 +355,12 @@ void intel_pmu_lbr_disable(struct perf_event *event) } } -void intel_pmu_lbr_enable_all(void) +void intel_pmu_lbr_enable_all(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->lbr_users) - __intel_pmu_lbr_enable(); + __intel_pmu_lbr_enable(pmi); } void intel_pmu_lbr_disable_all(void) -- cgit v1.2.3 From cd1f11de69226cc7ce7e7f22bdab9010103ddaa6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 Mar 2015 10:11:24 -0700 Subject: perf/x86/intel: Avoid rewriting DEBUGCTL with the same value for LBRs perf with LBRs on has a tendency to rewrite the DEBUGCTL MSR with the same value. Add a little optimization to skip the unnecessary write. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1426871484-21285-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3d537252f011..94e5b506caa6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -135,7 +135,7 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); static void __intel_pmu_lbr_enable(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 debugctl, lbr_select = 0; + u64 debugctl, lbr_select = 0, orig_debugctl; /* * No need to reprogram LBR_SELECT in a PMI, as it @@ -147,6 +147,7 @@ static void __intel_pmu_lbr_enable(bool pmi) } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + orig_debugctl = debugctl; debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. @@ -155,7 +156,8 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (!(lbr_select & LBR_CALL_STACK)) debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + if (orig_debugctl != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } static void __intel_pmu_lbr_disable(void) -- cgit v1.2.3 From 2e54a5bdba107f80a9ba90065fb43bba0498147d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Apr 2015 17:57:59 +0200 Subject: perf/x86/intel/pt: Fix the 32-bit build On a 32-bit build I got: arch/x86/kernel/cpu/perf_event_intel_pt.c:413:5: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] arch/x86/kernel/cpu/perf_event_intel_bts.c:162:24: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Fix it. The code should probably be (re-)tested on 32-bit systems to make sure all is fine. Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: linux-kernel@vger.kernel.org Cc: Peter Zijlstra (Intel) Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_bts.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_pt.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index fb1a4c28f3e1..ac1f0c55f379 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -159,7 +159,7 @@ bts_config_buffer(struct bts_buffer *buf) thresh = end; } - ds->bts_buffer_base = (u64)page_address(page) + phys->displacement; + ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; ds->bts_index = ds->bts_buffer_base + index; ds->bts_absolute_maximum = ds->bts_buffer_base + end; ds->bts_interrupt_threshold = !buf->snapshot diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index a9a1092cf836..f5a3afc65371 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -409,8 +409,8 @@ static void pt_topa_dump(struct pt_buffer *buf) list_for_each_entry(topa, &buf->tables, list) { int i; - pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table, - (void *)topa->phys, topa->offset, topa->size); + pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, + topa->phys, topa->offset, topa->size); for (i = 0; i < TENTS_PER_PAGE; i++) { pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", &topa->table[i], -- cgit v1.2.3 From 824b43763c562ee2feec16bb4017785528f3b54c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Apr 2015 12:55:46 +0200 Subject: crypto: x86/sha1_ssse3 - move SHA-1 SSSE3 implementation to base layer This removes all the boilerplate from the existing implementation, and replaces it with calls into the base layer. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ssse3_glue.c | 139 ++++++++------------------------------ 1 file changed, 28 insertions(+), 111 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 6c20fe04a738..33d1b9dc14cc 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data, #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); + unsigned int rounds); #endif -static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - - -static int sha1_ssse3_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha1_state){ - .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, - }; - - return 0; -} - -static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA1_BLOCK_SIZE - partial; - memcpy(sctx->buffer + partial, data, done); - sha1_transform_asm(sctx->state, sctx->buffer, 1); - } - - if (len - done >= SHA1_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; - - sha1_transform_asm(sctx->state, data + done, rounds); - done += rounds * SHA1_BLOCK_SIZE; - } - - memcpy(sctx->buffer, data + done, len - done); - - return 0; -} +static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - int res; - /* Handle the fast case right here */ - if (partial + len < SHA1_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buffer + partial, data, len); + if (!irq_fpu_usable() || + (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) + return crypto_sha1_update(desc, data, len); - return 0; - } + /* make sure casting to sha1_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - if (!irq_fpu_usable()) { - res = crypto_sha1_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha1_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } - - return res; -} - - -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA1_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); - if (!irq_fpu_usable()) { - crypto_sha1_update(desc, padding, padlen); - crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha1_ssse3_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buffer + index, padding, padlen); - } else { - __sha1_ssse3_update(desc, padding, padlen, index); - } - __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 5; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); + kernel_fpu_begin(); + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_asm); + kernel_fpu_end(); return 0; } -static int sha1_ssse3_export(struct shash_desc *desc, void *out) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); + if (!irq_fpu_usable()) + return crypto_sha1_finup(desc, data, len, out); - memcpy(out, sctx, sizeof(*sctx)); + kernel_fpu_begin(); + if (len) + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_asm); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + kernel_fpu_end(); - return 0; + return sha1_base_finish(desc, out); } -static int sha1_ssse3_import(struct shash_desc *desc, const void *in) +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; + return sha1_ssse3_finup(desc, NULL, 0, out); } #ifdef CONFIG_AS_AVX2 @@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data, static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_ssse3_init, + .init = sha1_base_init, .update = sha1_ssse3_update, .final = sha1_ssse3_final, - .export = sha1_ssse3_export, - .import = sha1_ssse3_import, + .finup = sha1_ssse3_finup, .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-ssse3", -- cgit v1.2.3 From 1631030ae63aef0a54fe08813e0f4e26c8ef9c78 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Apr 2015 12:55:47 +0200 Subject: crypto: x86/sha256_ssse3 - move SHA-224/256 SSSE3 implementation to base layer This removes all the boilerplate from the existing implementation, and replaces it with calls into the base layer. It also changes the prototypes of the core asm functions to be compatible with the base prototype void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks) so that they can be passed to the base layer directly. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx-asm.S | 10 +- arch/x86/crypto/sha256-avx2-asm.S | 10 +- arch/x86/crypto/sha256-ssse3-asm.S | 10 +- arch/x86/crypto/sha256_ssse3_glue.c | 193 +++++++----------------------------- 4 files changed, 50 insertions(+), 173 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 642f15687a0a..92b3b5d75ba9 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 BYTE_FLIP_MASK = %xmm13 NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg -SRND = %rdi # clobbers INP +SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx @@ -342,8 +342,8 @@ a = TMP_ ######################################################################## ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9e86944c539d..570ec5ec62d7 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg c = %ecx d = %r8d e = %edx # clobbers NUM_BLKS -y3 = %edi # clobbers INP +y3 = %esi # clobbers INP TBL = %rbp @@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE ######################################################################## ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index f833b74d902b..2cedc44e8121 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 BYTE_FLIP_MASK = %xmm12 NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg -SRND = %rdi # clobbers INP +SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx @@ -348,8 +348,8 @@ a = TMP_ ######################################################################## ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 8fad72f4dfd2..ccc338881ee8 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -36,195 +36,74 @@ #include #include #include -#include +#include #include #include #include #include -asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, - u64 rounds); +asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, + u64 rounds); #ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(const char *data, u32 *digest, +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, u64 rounds); #endif #ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, - u64 rounds); +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); #endif -static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); - - -static int sha256_ssse3_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - - return 0; -} - -static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA256_BLOCK_SIZE - partial; - memcpy(sctx->buf + partial, data, done); - sha256_transform_asm(sctx->buf, sctx->state, 1); - } - - if (len - done >= SHA256_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; - - sha256_transform_asm(data + done, sctx->state, (u64) rounds); - - done += rounds * SHA256_BLOCK_SIZE; - } - - memcpy(sctx->buf, data + done, len - done); - - return 0; -} +static void (*sha256_transform_asm)(u32 *, const char *, u64); static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; - int res; - /* Handle the fast case right here */ - if (partial + len < SHA256_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buf + partial, data, len); + if (!irq_fpu_usable() || + (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) + return crypto_sha256_update(desc, data, len); - return 0; - } - - if (!irq_fpu_usable()) { - res = crypto_sha256_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha256_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } - - return res; -} + /* make sure casting to sha256_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - -/* Add padding and return the message digest. */ -static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA256_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); - - if (!irq_fpu_usable()) { - crypto_sha256_update(desc, padding, padlen); - crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha256_ssse3_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buf + index, padding, padlen); - } else { - __sha256_ssse3_update(desc, padding, padlen, index); - } - __sha256_ssse3_update(desc, (const u8 *)&bits, - sizeof(bits), 56); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); + kernel_fpu_begin(); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_transform_asm); + kernel_fpu_end(); return 0; } -static int sha256_ssse3_export(struct shash_desc *desc, void *out) +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha256_state *sctx = shash_desc_ctx(desc); + if (!irq_fpu_usable()) + return crypto_sha256_finup(desc, data, len, out); - memcpy(out, sctx, sizeof(*sctx)); + kernel_fpu_begin(); + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_transform_asm); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + kernel_fpu_end(); - return 0; + return sha256_base_finish(desc, out); } -static int sha256_ssse3_import(struct shash_desc *desc, const void *in) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; -} - -static int sha224_ssse3_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; - - return 0; -} - -static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +/* Add padding and return the message digest. */ +static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { - u8 D[SHA256_DIGEST_SIZE]; - - sha256_ssse3_final(desc, D); - - memcpy(hash, D, SHA224_DIGEST_SIZE); - memzero_explicit(D, SHA256_DIGEST_SIZE); - - return 0; + return sha256_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg algs[] = { { .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_ssse3_init, + .init = sha256_base_init, .update = sha256_ssse3_update, .final = sha256_ssse3_final, - .export = sha256_ssse3_export, - .import = sha256_ssse3_import, + .finup = sha256_ssse3_finup, .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name = "sha256-ssse3", @@ -235,13 +114,11 @@ static struct shash_alg algs[] = { { } }, { .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_ssse3_init, + .init = sha224_base_init, .update = sha256_ssse3_update, - .final = sha224_ssse3_final, - .export = sha256_ssse3_export, - .import = sha256_ssse3_import, + .final = sha256_ssse3_final, + .finup = sha256_ssse3_finup, .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name = "sha224-ssse3", -- cgit v1.2.3 From e68410ebf62676dfb93aafff7c55b76644f37072 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Apr 2015 12:55:48 +0200 Subject: crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to base layer This removes all the boilerplate from the existing implementation, and replaces it with calls into the base layer. It also changes the prototypes of the core asm functions to be compatible with the base prototype void (sha512_block_fn)(struct sha256_state *sst, u8 const *src, int blocks) so that they can be passed to the base layer directly. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-avx-asm.S | 6 +- arch/x86/crypto/sha512-avx2-asm.S | 6 +- arch/x86/crypto/sha512-ssse3-asm.S | 6 +- arch/x86/crypto/sha512_ssse3_glue.c | 202 +++++++----------------------------- 4 files changed, 44 insertions(+), 176 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 974dde9bc6cd..565274d6a641 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -54,9 +54,9 @@ # Virtual Registers # ARG1 -msg = %rdi +digest = %rdi # ARG2 -digest = %rsi +msg = %rsi # ARG3 msglen = %rdx T1 = %rcx @@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_avx(const void* M, void* D, u64 L) +# void sha512_transform_avx(void* D, const void* M, u64 L) # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 568b96105f5c..a4771dcd1fcf 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -70,9 +70,9 @@ XFER = YTMP0 BYTE_FLIP_MASK = %ymm9 # 1st arg -INP = %rdi +CTX = %rdi # 2nd arg -CTX = %rsi +INP = %rsi # 3rd arg NUM_BLKS = %rdx @@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_rorx(const void* M, void* D, uint64_t L)# +# void sha512_transform_rorx(void* D, const void* M, uint64_t L)# # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index fb56855d51f5..e610e29cbc81 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -53,9 +53,9 @@ # Virtual Registers # ARG1 -msg = %rdi +digest = %rdi # ARG2 -digest = %rsi +msg = %rsi # ARG3 msglen = %rdx T1 = %rcx @@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_ssse3(const void* M, void* D, u64 L)# +# void sha512_transform_ssse3(void* D, const void* M, u64 L)# # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 0b6af26832bf..d9fa4c1e063f 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,205 +34,75 @@ #include #include #include -#include +#include #include #include #include #include -asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, - u64 rounds); +asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, + u64 rounds); #ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(const char *data, u64 *digest, +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, u64 rounds); #endif #ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, - u64 rounds); +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); #endif -static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); - - -static int sha512_ssse3_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA512_H0; - sctx->state[1] = SHA512_H1; - sctx->state[2] = SHA512_H2; - sctx->state[3] = SHA512_H3; - sctx->state[4] = SHA512_H4; - sctx->state[5] = SHA512_H5; - sctx->state[6] = SHA512_H6; - sctx->state[7] = SHA512_H7; - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} +static void (*sha512_transform_asm)(u64 *, const char *, u64); -static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count[0] += len; - if (sctx->count[0] < len) - sctx->count[1]++; - if (partial) { - done = SHA512_BLOCK_SIZE - partial; - memcpy(sctx->buf + partial, data, done); - sha512_transform_asm(sctx->buf, sctx->state, 1); - } - - if (len - done >= SHA512_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; + if (!irq_fpu_usable() || + (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) + return crypto_sha512_update(desc, data, len); - sha512_transform_asm(data + done, sctx->state, (u64) rounds); - - done += rounds * SHA512_BLOCK_SIZE; - } + /* make sure casting to sha512_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); - memcpy(sctx->buf, data + done, len - done); + kernel_fpu_begin(); + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_transform_asm); + kernel_fpu_end(); return 0; } -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; - int res; - - /* Handle the fast case right here */ - if (partial + len < SHA512_BLOCK_SIZE) { - sctx->count[0] += len; - if (sctx->count[0] < len) - sctx->count[1]++; - memcpy(sctx->buf + partial, data, len); - - return 0; - } + if (!irq_fpu_usable()) + return crypto_sha512_finup(desc, data, len, out); - if (!irq_fpu_usable()) { - res = crypto_sha512_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha512_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } + kernel_fpu_begin(); + if (len) + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_transform_asm); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + kernel_fpu_end(); - return res; + return sha512_base_finish(desc, out); } - /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { - struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be64 *dst = (__be64 *)out; - __be64 bits[2]; - static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; - - /* save number of bits */ - bits[1] = cpu_to_be64(sctx->count[0] << 3); - bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); - - /* Pad out to 112 mod 128 and append length */ - index = sctx->count[0] & 0x7f; - padlen = (index < 112) ? (112 - index) : ((128+112) - index); - - if (!irq_fpu_usable()) { - crypto_sha512_update(desc, padding, padlen); - crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha512_ssse3_update() */ - if (padlen <= 112) { - sctx->count[0] += padlen; - if (sctx->count[0] < padlen) - sctx->count[1]++; - memcpy(sctx->buf + index, padding, padlen); - } else { - __sha512_ssse3_update(desc, padding, padlen, index); - } - __sha512_ssse3_update(desc, (const u8 *)&bits, - sizeof(bits), 112); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be64(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); - - return 0; -} - -static int sha512_ssse3_export(struct shash_desc *desc, void *out) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - - return 0; -} - -static int sha512_ssse3_import(struct shash_desc *desc, const void *in) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; -} - -static int sha384_ssse3_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA384_H0; - sctx->state[1] = SHA384_H1; - sctx->state[2] = SHA384_H2; - sctx->state[3] = SHA384_H3; - sctx->state[4] = SHA384_H4; - sctx->state[5] = SHA384_H5; - sctx->state[6] = SHA384_H6; - sctx->state[7] = SHA384_H7; - - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} - -static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) -{ - u8 D[SHA512_DIGEST_SIZE]; - - sha512_ssse3_final(desc, D); - - memcpy(hash, D, SHA384_DIGEST_SIZE); - memzero_explicit(D, SHA512_DIGEST_SIZE); - - return 0; + return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_ssse3_init, + .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, - .export = sha512_ssse3_export, - .import = sha512_ssse3_import, + .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), - .statesize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", @@ -243,13 +113,11 @@ static struct shash_alg algs[] = { { } }, { .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_ssse3_init, + .init = sha384_base_init, .update = sha512_ssse3_update, - .final = sha384_ssse3_final, - .export = sha512_ssse3_export, - .import = sha512_ssse3_import, + .final = sha512_ssse3_final, + .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), - .statesize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", -- cgit v1.2.3 From 066450be419fa48007a9f29e19828f2a86198754 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Apr 2015 11:11:21 +0200 Subject: perf/x86/intel/pt: Clean up the control flow in pt_pmu_hw_init() Dan Carpenter pointed out that the control flow in pt_pmu_hw_init() is a bit messy: for example the kfree(de_attrs) is entirely superfluous. Another problem is the inconsistent mixing of label based and direct return error handling. Add modern, label based error handling instead and clean up the code a bit as well. Note that we'll still do a kfree(NULL) in the normal case - this does not matter as this is an init path and kfree() returns early if it sees a NULL. Reported-by: Dan Carpenter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150409090805.GG17605@mwanda Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 53 +++++++++++++++++-------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index f5a3afc65371..f2770641c0fd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -119,48 +119,55 @@ static int __init pt_pmu_hw_init(void) struct dev_ext_attribute *de_attrs; struct attribute **attrs; size_t size; + int ret; long i; - if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) { - for (i = 0; i < PT_CPUID_LEAVES; i++) - cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i * 4], - &pt_pmu.caps[CR_EBX + i * 4], - &pt_pmu.caps[CR_ECX + i * 4], - &pt_pmu.caps[CR_EDX + i * 4]); - } else { - return -ENODEV; + attrs = NULL; + ret = -ENODEV; + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + goto fail; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + cpuid_count(20, i, + &pt_pmu.caps[CR_EAX + i*4], + &pt_pmu.caps[CR_EBX + i*4], + &pt_pmu.caps[CR_ECX + i*4], + &pt_pmu.caps[CR_EDX + i*4]); } - size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1); + ret = -ENOMEM; + size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); attrs = kzalloc(size, GFP_KERNEL); if (!attrs) - goto err_attrs; + goto fail; - size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1); + size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); de_attrs = kzalloc(size, GFP_KERNEL); if (!de_attrs) - goto err_de_attrs; + goto fail; for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { - de_attrs[i].attr.attr.name = pt_caps[i].name; + struct dev_ext_attribute *de_attr = de_attrs + i; + + de_attr->attr.attr.name = pt_caps[i].name; + + sysfs_attr_init(&de_attrs->attr.attr); - sysfs_attr_init(&de_attrs[i].attr.attr); - de_attrs[i].attr.attr.mode = S_IRUGO; - de_attrs[i].attr.show = pt_cap_show; - de_attrs[i].var = (void *)i; - attrs[i] = &de_attrs[i].attr.attr; + de_attr->attr.attr.mode = S_IRUGO; + de_attr->attr.show = pt_cap_show; + de_attr->var = (void *)i; + + attrs[i] = &de_attr->attr.attr; } pt_cap_group.attrs = attrs; + return 0; -err_de_attrs: - kfree(de_attrs); -err_attrs: +fail: kfree(attrs); - return -ENOMEM; + return ret; } #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) -- cgit v1.2.3 From fd223849f10a28fa40201652b5f13d52fa8f2bb0 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 13 Jul 2014 17:41:57 +0200 Subject: um: Remove signal translation and exec_domain As execution domain support is gone we can remove signal translation from the signal code and remove exec_domain from thread_info. Signed-off-by: Richard Weinberger --- arch/x86/um/signal.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 0c8c32bfd792..592491d1d70d 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, if (err) return err; - /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } - PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_DI(regs) = sig; /* In case the signal handler was declared without prototypes */ -- cgit v1.2.3 From 3050a35fba296196cb00e87f4a96aa7d9ed17a7b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 13 Jul 2014 17:43:51 +0200 Subject: x86: Remove signal translation and exec_domain As execution domain support is gone we can remove signal translation from the signal code and remove exec_domain from thread_info. Signed-off-by: Richard Weinberger --- arch/x86/include/asm/thread_info.h | 3 --- arch/x86/kernel/signal.c | 16 +--------------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..2df52baf5228 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -19,13 +19,11 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -struct exec_domain; #include #include struct thread_info { struct task_struct *task; /* main task structure */ - struct exec_domain *exec_domain; /* execution domain */ __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ @@ -39,7 +37,6 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ - .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..5ddc7ec20e75 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -592,24 +592,10 @@ badframe: return 0; } -/* - * OK, we're invoking a handler: - */ -static int signr_convert(int sig) -{ -#ifdef CONFIG_X86_32 - struct thread_info *info = current_thread_info(); - - if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) - return info->exec_domain->signal_invmap[sig]; -#endif /* CONFIG_X86_32 */ - return sig; -} - static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = signr_convert(ksig->sig); + int usig = ksig->sig; sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; -- cgit v1.2.3 From d0b5e15f0c0fdd759dd3dd48dc2dc2e7199e0da0 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 18 Mar 2015 21:31:27 +0100 Subject: um: Remove SKAS3/4 support Before we had SKAS0 UML had two modes of operation TT (tracing thread) and SKAS3/4 (separated kernel address space). TT was known to be insecure and got removed a long time ago. SKAS3/4 required a few (3 or 4) patches on the host side which never went mainline. The last host patch is 10 years old. With SKAS0 mode (separated kernel address space using 0 host patches), default since 2005, SKAS3/4 is obsolete and can be removed. Signed-off-by: Richard Weinberger --- arch/x86/um/ldt.c | 227 +++++++------------------------ arch/x86/um/shared/sysdep/faultinfo_32.h | 3 - arch/x86/um/shared/sysdep/faultinfo_64.h | 3 - arch/x86/um/shared/sysdep/skas_ptrace.h | 22 --- 4 files changed, 48 insertions(+), 207 deletions(-) delete mode 100644 arch/x86/um/shared/sysdep/skas_ptrace.h (limited to 'arch/x86') diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 8e08176f0bcb..5c0b711d2433 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -8,9 +8,7 @@ #include #include #include -#include #include -#include #include extern int modify_ldt(int func, void *ptr, unsigned long bytecount); @@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func, struct user_desc *desc, void **addr, int done) { long res; - - if (proc_mm) { - /* - * This is a special handling for the case, that the mm to - * modify isn't current->active_mm. - * If this is called directly by modify_ldt, - * (current->active_mm->context.skas.u == mm_idp) - * will be true. So no call to __switch_mm(mm_idp) is done. - * If this is called in case of init_new_ldt or PTRACE_LDT, - * mm_idp won't belong to current->active_mm, but child->mm. - * So we need to switch child's mm into our userspace, then - * later switch back. - * - * Note: I'm unsure: should interrupts be disabled here? - */ - if (!current->active_mm || current->active_mm == &init_mm || - mm_idp != ¤t->active_mm->context.id) - __switch_mm(mm_idp); - } - - if (ptrace_ldt) { - struct ptrace_ldt ldt_op = (struct ptrace_ldt) { - .func = func, - .ptr = desc, - .bytecount = sizeof(*desc)}; - u32 cpu; - int pid; - - if (!proc_mm) - pid = mm_idp->u.pid; - else { - cpu = get_cpu(); - pid = userspace_pid[cpu]; - } - - res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op); - - if (proc_mm) - put_cpu(); - } - else { - void *stub_addr; - res = syscall_stub_data(mm_idp, (unsigned long *)desc, - (sizeof(*desc) + sizeof(long) - 1) & - ~(sizeof(long) - 1), - addr, &stub_addr); - if (!res) { - unsigned long args[] = { func, - (unsigned long)stub_addr, - sizeof(*desc), - 0, 0, 0 }; - res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, - 0, addr, done); - } + void *stub_addr; + res = syscall_stub_data(mm_idp, (unsigned long *)desc, + (sizeof(*desc) + sizeof(long) - 1) & + ~(sizeof(long) - 1), + addr, &stub_addr); + if (!res) { + unsigned long args[] = { func, + (unsigned long)stub_addr, + sizeof(*desc), + 0, 0, 0 }; + res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, + 0, addr, done); } - if (proc_mm) { - /* - * This is the second part of special handling, that makes - * PTRACE_LDT possible to implement. - */ - if (current->active_mm && current->active_mm != &init_mm && - mm_idp != ¤t->active_mm->context.id) - __switch_mm(¤t->active_mm->context.id); - } - - return res; -} - -static long read_ldt_from_host(void __user * ptr, unsigned long bytecount) -{ - int res, n; - struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) { - .func = 0, - .bytecount = bytecount, - .ptr = kmalloc(bytecount, GFP_KERNEL)}; - u32 cpu; - - if (ptrace_ldt.ptr == NULL) - return -ENOMEM; - - /* - * This is called from sys_modify_ldt only, so userspace_pid gives - * us the right number - */ - - cpu = get_cpu(); - res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt); - put_cpu(); - if (res < 0) - goto out; - - n = copy_to_user(ptr, ptrace_ldt.ptr, res); - if (n != 0) - res = -EFAULT; - - out: - kfree(ptrace_ldt.ptr); - return res; } @@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; err = bytecount; - if (ptrace_ldt) - return read_ldt_from_host(ptr, bytecount); - mutex_lock(&ldt->lock); if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; @@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) goto out; } - if (!ptrace_ldt) - mutex_lock(&ldt->lock); + mutex_lock(&ldt->lock); err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); if (err) goto out_unlock; - else if (ptrace_ldt) { - /* With PTRACE_LDT available, this is used as a flag only */ - ldt->entry_count = 1; - goto out; - } if (ldt_info.entry_number >= ldt->entry_count && ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { @@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) int i; long page, err=0; void *addr = NULL; - struct proc_mm_op copy; - if (!ptrace_ldt) - mutex_init(&new_mm->arch.ldt.lock); + mutex_init(&new_mm->arch.ldt.lock); if (!from_mm) { memset(&desc, 0, sizeof(desc)); /* - * We have to initialize a clean ldt. + * Now we try to retrieve info about the ldt, we + * inherited from the host. All ldt-entries found + * will be reset in the following loop */ - if (proc_mm) { - /* - * If the new mm was created using proc_mm, host's - * default-ldt currently is assigned, which normally - * contains the call-gates for lcall7 and lcall27. - * To remove these gates, we simply write an empty - * entry as number 0 to the host. - */ - err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1); - } - else{ - /* - * Now we try to retrieve info about the ldt, we - * inherited from the host. All ldt-entries found - * will be reset in the following loop - */ - ldt_get_host_info(); - for (num_p=host_ldt_entries; *num_p != -1; num_p++) { - desc.entry_number = *num_p; - err = write_ldt_entry(&new_mm->id, 1, &desc, - &addr, *(num_p + 1) == -1); - if (err) - break; - } + ldt_get_host_info(); + for (num_p=host_ldt_entries; *num_p != -1; num_p++) { + desc.entry_number = *num_p; + err = write_ldt_entry(&new_mm->id, 1, &desc, + &addr, *(num_p + 1) == -1); + if (err) + break; } new_mm->arch.ldt.entry_count = 0; goto out; } - if (proc_mm) { - /* - * We have a valid from_mm, so we now have to copy the LDT of - * from_mm to new_mm, because using proc_mm an new mm with - * an empty/default LDT was created in new_mm() - */ - copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, - .u = - { .copy_segments = - from_mm->id.u.mm_fd } } ); - i = os_write_file(new_mm->id.u.mm_fd, ©, sizeof(copy)); - if (i != sizeof(copy)) - printk(KERN_ERR "new_mm : /proc/mm copy_segments " - "failed, err = %d\n", -i); - } - - if (!ptrace_ldt) { - /* - * Our local LDT is used to supply the data for - * modify_ldt(READLDT), if PTRACE_LDT isn't available, - * i.e., we have to use the stub for modify_ldt, which - * can't handle the big read buffer of up to 64kB. - */ - mutex_lock(&from_mm->arch.ldt.lock); - if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) - memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, - sizeof(new_mm->arch.ldt.u.entries)); - else { - i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; - while (i-->0) { - page = __get_free_page(GFP_KERNEL|__GFP_ZERO); - if (!page) { - err = -ENOMEM; - break; - } - new_mm->arch.ldt.u.pages[i] = - (struct ldt_entry *) page; - memcpy(new_mm->arch.ldt.u.pages[i], - from_mm->arch.ldt.u.pages[i], PAGE_SIZE); + /* + * Our local LDT is used to supply the data for + * modify_ldt(READLDT), if PTRACE_LDT isn't available, + * i.e., we have to use the stub for modify_ldt, which + * can't handle the big read buffer of up to 64kB. + */ + mutex_lock(&from_mm->arch.ldt.lock); + if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) + memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, + sizeof(new_mm->arch.ldt.u.entries)); + else { + i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; + while (i-->0) { + page = __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!page) { + err = -ENOMEM; + break; } + new_mm->arch.ldt.u.pages[i] = + (struct ldt_entry *) page; + memcpy(new_mm->arch.ldt.u.pages[i], + from_mm->arch.ldt.u.pages[i], PAGE_SIZE); } - new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; - mutex_unlock(&from_mm->arch.ldt.lock); } + new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; + mutex_unlock(&from_mm->arch.ldt.lock); out: return err; @@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm) { int i; - if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { + if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; while (i-- > 0) free_page((long) mm->arch.ldt.u.pages[i]); diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index a26086b8a800..b6f2437ec29c 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -27,9 +27,6 @@ struct faultinfo { /* This is Page Fault */ #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) -/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */ -#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo) - #define PTRACE_FULL_FAULTINFO 0 #endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index f811cbe15d62..ee88f88974ea 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -27,9 +27,6 @@ struct faultinfo { /* This is Page Fault */ #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) -/* No broken SKAS API, which doesn't pass trap_no, here. */ -#define SEGV_MAYBE_FIXABLE(fi) 0 - #define PTRACE_FULL_FAULTINFO 1 #endif diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h deleted file mode 100644 index 453febe98993..000000000000 --- a/arch/x86/um/shared/sysdep/skas_ptrace.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_X86_SKAS_PTRACE_H -#define __SYSDEP_X86_SKAS_PTRACE_H - -struct ptrace_faultinfo { - int is_write; - unsigned long addr; -}; - -struct ptrace_ldt { - int func; - void *ptr; - unsigned long bytecount; -}; - -#define PTRACE_LDT 54 - -#endif -- cgit v1.2.3 From 28fa468f53163bc0b867b4cc75a9e36e7ed4dbbd Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 18 Mar 2015 21:42:54 +0100 Subject: um: Remove broken SMP support At times where UML used the TT mode to operate it had kind of SMP support. It never got finished nor was stable. Let's rip out that cruft and stop confusing developers which do tree-wide SMP cleanups. If someone wants SMP support UML it has do be done from scratch. Signed-off-by: Richard Weinberger --- arch/x86/um/asm/barrier.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..aec75a5d8722 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -36,22 +36,11 @@ #endif /* CONFIG_X86_PPRO_FENCE */ #define dma_wmb() barrier() -#ifdef CONFIG_SMP - -#define smp_mb() mb() -#define smp_rmb() dma_rmb() -#define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) - -#else /* CONFIG_SMP */ - #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif /* CONFIG_SMP */ - #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -- cgit v1.2.3 From a98a6d864d3b84219a6ec6213b00c260fb52f9f4 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 18 Mar 2015 21:59:35 +0100 Subject: um: Remove broken highmem support Highmem was always buggy and experimental on UML(i386). In times where 64 bit computers are default we can remove that experimental code. Signed-off-by: Richard Weinberger --- arch/x86/um/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index eafa324eb7a5..acb384d24669 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o -subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o else -- cgit v1.2.3 From fc9bea0e28db8cbfe0a08c1bfb1796bfd7adf49b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 24 Mar 2015 18:31:24 +0300 Subject: x86, UML: fix integer overflow in ELF_ET_DYN_BASE Almost all arches define ELF_ET_DYN_BASE as 2/3 of TASK_SIZE. Though it seems that some architectures do this in a wrong way. The problem is that 2*TASK_SIZE may overflow 32-bits so the real ELF_ET_DYN_BASE becomes wrong. Fix this overflow by dividing TASK_SIZE prior to multiplying: (TASK_SIZE / 3 * 2) Signed-off-by: Andrey Ryabinin Signed-off-by: Richard Weinberger --- arch/x86/um/asm/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 25a1022dd793..0a656b727b1a 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); #define ELF_EXEC_PAGESIZE 4096 -#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) +#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) -- cgit v1.2.3 From c4d30668da689de2f27bb0b19de4430d6c95d7cf Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 10 Apr 2015 00:22:56 -0400 Subject: x86 msr-index: define MSR_TURBO_RATIO_LIMIT,1,2 MSR_TURBO_RATIO_LIMIT has grown into a set of three registers. Add the documented names for them, in preparation for deleting the previous ad-hoc names: +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af Signed-off-by: Len Brown Cc: x86@kernel.org --- arch/x86/include/uapi/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 3ce079136c11..c4c75272314a 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -61,6 +61,9 @@ #define MSR_OFFCORE_RSP_1 0x000001a7 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad #define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 -- cgit v1.2.3 From 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 12 Apr 2015 21:47:15 +0300 Subject: KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save kvm_init_msr_list is currently called before hardware_setup. As a result, vmx_mpx_supported always returns false when kvm_init_msr_list checks whether to save MSR_IA32_BNDCFGS. Move kvm_init_msr_list after vmx_hardware_setup is called to fix this issue. Signed-off-by: Nadav Amit Message-Id: <1428864435-4732-1-git-send-email-namit@cs.technion.ac.il> Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1a81267f3f6..ed31c31b2485 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque) kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; - kvm_init_msr_list(); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void) { - return kvm_x86_ops->hardware_setup(); + int r; + + r = kvm_x86_ops->hardware_setup(); + if (r != 0) + return r; + + kvm_init_msr_list(); + return 0; } void kvm_arch_hardware_unsetup(void) -- cgit v1.2.3 From bea15428b9d6bc36e87288f168ab314619a66757 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Apr 2015 15:40:02 +0200 Subject: KVM: x86: cleanup kvm_irq_delivery_to_apic_fast Sparse is reporting a "we previously assumed 'src' could be null" error. This is true as far as the static analyzer can see, but in practice only IPIs can set shorthand to self and they also set 'src', so it's ok. Still, move the initialization of x2apic_ipi (and thus the NULL check for src right before the first use. While at it, initializing ret to "false" is somewhat confusing because of the almost immediate assigned of "true" to the same variable. Thus, initialize it to "true" and modify it in the only path that used to use the value from "bool ret = false". There is no change in generated code from this change. Reported-by: Dan Carpenter Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d67206a7b99a..629af0f1c5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, unsigned long bitmap = 1; struct kvm_lapic **dst; int i; - bool ret = false; - bool x2apic_ipi = src && apic_x2apic_mode(src); + bool ret, x2apic_ipi; *r = -1; @@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (irq->shorthand) return false; + x2apic_ipi = src && apic_x2apic_mode(src); if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) return false; + ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) + if (!map) { + ret = false; goto out; - - ret = true; + } if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) -- cgit v1.2.3 From 692297d8f96887f836d9049a653ed05a71cf48fb Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:19 -0700 Subject: watchdog: introduce the hardlockup_detector_disable() function Have kvm_guest_init() use hardlockup_detector_disable() instead of watchdog_enable_hardlockup_detector(false). Remove the watchdog_hardlockup_detector_is_enabled() and the watchdog_enable_hardlockup_detector() function which are no longer needed. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e354cc6446ab..9435620062df 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -513,7 +513,7 @@ void __init kvm_guest_init(void) * can get false positives too easily, for example if the host is * overcommitted. */ - watchdog_enable_hardlockup_detector(false); + hardlockup_detector_disable(); } static noinline uint32_t __kvm_cpuid_base(void) -- cgit v1.2.3 From 982333683385343d8d2db9a1df69c98406f42687 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:14 -0700 Subject: x86: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 6 ++++++ arch/x86/include/asm/paravirt.h | 8 ++++---- arch/x86/include/asm/paravirt_types.h | 8 ++++---- arch/x86/include/asm/pgalloc.h | 8 ++++---- arch/x86/include/asm/pgtable-2level_types.h | 1 - arch/x86/include/asm/pgtable-3level_types.h | 2 -- arch/x86/include/asm/pgtable.h | 8 ++++---- arch/x86/include/asm/pgtable_64_types.h | 1 - arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/kernel/paravirt.c | 6 +++--- arch/x86/mm/pgtable.c | 14 +++++++------- arch/x86/xen/mmu.c | 14 +++++++------- 12 files changed, 41 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..3e3aaad23414 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -277,6 +277,12 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config PGTABLE_LEVELS + int + default 4 if X86_64 + default 3 if X86_PAE + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5f6051d5d139..8957810ad7d1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); } -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 static inline pmd_t __pmd(pmdval_t val) { pmdval_t ret; @@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ #ifdef CONFIG_X86_PAE /* Special-case pte-setting operations for PAE, which can't update a diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..f7b0b5c112f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -294,7 +294,7 @@ struct pv_mmu_ops { struct paravirt_callee_save pgd_val; struct paravirt_callee_save make_pgd; -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); void (*pte_clear)(struct mm_struct *mm, unsigned long addr, @@ -308,13 +308,13 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c4412e972bbd..bf7f8b55b0f9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define pmd_pgtable(pmd) pmd_page(pmd) -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *page; @@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) } #endif /* CONFIG_X86_PAE */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); @@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index daacc23e3fb9..392576433e77 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -17,7 +17,6 @@ typedef union { #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 2 /* * traditional i386 two-level paging structure: diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 1bd5876c8649..bcc89625ebe5 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -24,8 +24,6 @@ typedef union { #define SHARED_KERNEL_PMD 1 #endif -#define PAGETABLE_LEVELS 3 - /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a0c35bf6cb92..fe57e7a98839 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) return npg >> (20 - PAGE_SHIFT); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return native_pud_val(pud) == 0; @@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud) { return 0; } -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd) { return !native_pgd_val(pgd); } -#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 602b6028c5b6..e6844dfb4471 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 4 /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c7c10802e9c..78f0c8cbe316 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud) } #endif -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 typedef struct { pmdval_t pmd; } pmd_t; static inline pmd_t native_make_pmd(pmdval_t val) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..c614dd492f5f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .pte_clear = native_pte_clear, @@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, .set_pgd = native_set_pgd, #endif -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, .pgd_val = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5a7e5252c878..b28edfecbdfe 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -58,7 +58,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) tlb_remove_page(tlb, pte); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = virt_to_page(pmd); @@ -74,14 +74,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb_remove_page(tlb, page); } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { @@ -117,9 +117,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - PAGETABLE_LEVELS == 4) { + if (CONFIG_PGTABLE_LEVELS == 2 || + (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + CONFIG_PGTABLE_LEVELS == 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..65083ad63b6f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 __visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); @@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ /* * (Yet another) pagetable walker. This one is intended for pinning a @@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.set_pgd = xen_set_pgd; #endif @@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, -- cgit v1.2.3 From 5d72b4fba40ef4b3f7a1a11d6aacc85d9af81561 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:29 -0700 Subject: x86, mm: support huge I/O mapping capability I/F Implement huge I/O mapping capability interfaces for ioremap() on x86. IOREMAP_MAX_ORDER is defined to PUD_SHIFT on x86/64 and PMD_SHIFT on x86/32, which overrides the default value defined in . Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_types.h | 2 ++ arch/x86/mm/ioremap.c | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..c7c712f2648b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,8 +40,10 @@ #ifdef CONFIG_X86_64 #include +#define IOREMAP_MAX_ORDER (PUD_SHIFT) #else #include +#define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fdf617c00e2f..5ead4d6cf3a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, /* * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. + * address space. It transparently creates kernel huge I/O mapping when + * the physical address is aligned by a huge page size (1GB or 2MB) and + * the requested size is at least the huge page size. + * + * NOTE: MTRRs can override PAT memory types with a 4KB granularity. + * Therefore, the mapping code falls back to use a smaller page toward 4KB + * when a mapping range is covered by non-WB type of MTRRs. * * NOTE! We need to allow non-page-aligned mappings too: we will obviously * have to convert them into an offset in a page-aligned mapping, but the @@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +int arch_ioremap_pud_supported(void) +{ +#ifdef CONFIG_X86_64 + return cpu_has_gbpages; +#else + return 0; +#endif +} + +int arch_ioremap_pmd_supported(void) +{ + return cpu_has_pse; +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access -- cgit v1.2.3 From 6b6378355b925050eb6fa966742d8c2d65ff0d83 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:32 -0700 Subject: x86, mm: support huge KVA mappings on x86 Implement huge KVA mapping interfaces on x86. On x86, MTRRs can override PAT memory types with a 4KB granularity. When using a huge page, MTRRs can override the memory type of the huge page, which may lead a performance penalty. The processor can also behave in an undefined manner if a huge page is mapped to a memory range that MTRRs have mapped with multiple different memory types. Therefore, the mapping code falls back to use a smaller page size toward 4KB when a mapping range is covered by non-WB type of MTRRs. The WB type of MTRRs has no affect on the PAT memory types. pud_set_huge() and pmd_set_huge() call mtrr_type_lookup() to see if a given range is covered by MTRRs. MTRR_TYPE_WRBACK indicates that the range is either covered by WB or not covered and the MTRR default value is set to WB. 0xFF indicates that MTRRs are disabled. HAVE_ARCH_HUGE_VMAP is selected when X86_64 or X86_32 with X86_PAE is set. X86_32 without X86_PAE is not supported since such config can unlikey be benefited from this feature, and there was an issue found in testing. [fengguang.wu@intel.com: ioremap_pud_capable can be static] Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/pgtable.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3e3aaad23414..0f948cefaeb1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -99,6 +99,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b28edfecbdfe..0b97d2c75df3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -4,6 +4,7 @@ #include #include #include +#include #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO @@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, { __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr; + + /* + * Do not use a huge page when the range is covered by non-WB type + * of MTRRs. + */ + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pud, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr; + + /* + * Do not use a huge page when the range is covered by non-WB type + * of MTRRs. + */ + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pmd, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_large(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -- cgit v1.2.3 From 82168140bc4cec7ec9bad39705518541149ff8b7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:45 -0700 Subject: x86: standardize mmap_rnd() usage In preparation for splitting out ET_DYN ASLR, this refactors the use of mmap_rnd() to be used similarly to arm, and extracts the checking of PF_RANDOMIZE. Signed-off-by: Kees Cook Reviewed-by: Ingo Molnar Cc: Oleg Nesterov Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/mmap.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index df4552bd239e..ebfa52030d5c 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -67,22 +67,21 @@ static int mmap_is_legacy(void) static unsigned long mmap_rnd(void) { - unsigned long rnd = 0; + unsigned long rnd; /* - * 8 bits of randomness in 32bit mmaps, 20 address space bits - * 28 bits of randomness in 64bit mmaps, 40 address space bits - */ - if (current->flags & PF_RANDOMIZE) { - if (mmap_is_ia32()) - rnd = get_random_int() % (1<<8); - else - rnd = get_random_int() % (1<<28); - } + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (mmap_is_ia32()) + rnd = (unsigned long)get_random_int() % (1<<8); + else + rnd = (unsigned long)get_random_int() % (1<<28); + return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(void) +static unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -91,19 +90,19 @@ static unsigned long mmap_base(void) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); + return PAGE_ALIGN(TASK_SIZE - gap - rnd); } /* * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -static unsigned long mmap_legacy_base(void) +static unsigned long mmap_legacy_base(unsigned long rnd) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; else - return TASK_UNMAPPED_BASE + mmap_rnd(); + return TASK_UNMAPPED_BASE + rnd; } /* @@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { - mm->mmap_legacy_base = mmap_legacy_base(); - mm->mmap_base = mmap_base(); + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + + mm->mmap_legacy_base = mmap_legacy_base(random_factor); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { + mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } -- cgit v1.2.3 From 2b68f6caeac271620cd2f9362aeaed360e317df0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:00 -0700 Subject: mm: expose arch_mmap_rnd when available When an architecture fully supports randomizing the ELF load location, a per-arch mmap_rnd() function is used to find a randomized mmap base. In preparation for randomizing the location of ET_DYN binaries separately from mmap, this renames and exports these functions as arch_mmap_rnd(). Additionally introduces CONFIG_ARCH_HAS_ELF_RANDOMIZE for describing this feature on architectures that support it (which is a superset of ARCH_BINFMT_ELF_RANDOMIZE_PIE, since s390 already supports a separated ET_DYN ASLR from mmap ASLR without the ARCH_BINFMT_ELF_RANDOMIZE_PIE logic). Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/mmap.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0f948cefaeb1..782ddbbc1c9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index ebfa52030d5c..9d518d693b4b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -65,7 +65,7 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -114,7 +114,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); mm->mmap_legacy_base = mmap_legacy_base(random_factor); -- cgit v1.2.3 From d1fd836dcf00d2028c700c7e44d2c23404062c90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:07 -0700 Subject: mm: split ET_DYN ASLR from mmap ASLR This fixes the "offset2lib" weakness in ASLR for arm, arm64, mips, powerpc, and x86. The problem is that if there is a leak of ASLR from the executable (ET_DYN), it means a leak of shared library offset as well (mmap), and vice versa. Further details and a PoC of this attack is available here: http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html With this patch, a PIE linked executable (ET_DYN) has its own ASLR region: $ ./show_mmaps_pie 54859ccd6000-54859ccd7000 r-xp ... /tmp/show_mmaps_pie 54859ced6000-54859ced7000 r--p ... /tmp/show_mmaps_pie 54859ced7000-54859ced8000 rw-p ... /tmp/show_mmaps_pie 7f75be764000-7f75be91f000 r-xp ... /lib/x86_64-linux-gnu/libc.so.6 7f75be91f000-7f75beb1f000 ---p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb1f000-7f75beb23000 r--p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb23000-7f75beb25000 rw-p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb25000-7f75beb2a000 rw-p ... 7f75beb2a000-7f75beb4d000 r-xp ... /lib64/ld-linux-x86-64.so.2 7f75bed45000-7f75bed46000 rw-p ... 7f75bed46000-7f75bed47000 r-xp ... 7f75bed47000-7f75bed4c000 rw-p ... 7f75bed4c000-7f75bed4d000 r--p ... /lib64/ld-linux-x86-64.so.2 7f75bed4d000-7f75bed4e000 rw-p ... /lib64/ld-linux-x86-64.so.2 7f75bed4e000-7f75bed4f000 rw-p ... 7fffb3741000-7fffb3762000 rw-p ... [stack] 7fffb377b000-7fffb377d000 r--p ... [vvar] 7fffb377d000-7fffb377f000 r-xp ... [vdso] The change is to add a call the newly created arch_mmap_rnd() into the ELF loader for handling ET_DYN ASLR in a separate region from mmap ASLR, as was already done on s390. Removes CONFIG_BINFMT_ELF_RANDOMIZE_PIE, which is no longer needed. Signed-off-by: Kees Cook Reported-by: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 782ddbbc1c9a..1f7f185934a5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,7 +87,6 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE -- cgit v1.2.3 From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:12 -0700 Subject: mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE The arch_randomize_brk() function is used on several architectures, even those that don't support ET_DYN ASLR. To avoid bulky extern/#define tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for the architectures that support it, while still handling CONFIG_COMPAT_BRK. Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/elf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 935588d95c82..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - /* * True on X86_32 or when emulating IA32 on X86_64 */ -- cgit v1.2.3 From 4a20799d11f64e6b8725cacc7619b1ae1dbf9acd Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:27 -0700 Subject: mm: move memtest under mm Memtest is a simple feature which fills the memory with a given set of patterns and validates memory contents, if bad memory regions is detected it reserves them via memblock API. Since memblock API is widely used by other architectures this feature can be enabled outside of x86 world. This patch set promotes memtest to live under generic mm umbrella and enables memtest feature for arm/arm64. It was reported that this patch set was useful for tracking down an issue with some errant DMA on an arm64 platform. This patch (of 6): There is nothing platform dependent in the core memtest code, so other platforms might benefit from this feature too. [linux@roeck-us.net: MEMTEST depends on MEMBLOCK] Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Russell King Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 11 ----- arch/x86/include/asm/e820.h | 8 --- arch/x86/mm/Makefile | 2 - arch/x86/mm/memtest.c | 118 -------------------------------------------- 4 files changed, 139 deletions(-) delete mode 100644 arch/x86/mm/memtest.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1f7f185934a5..d43e7e1c784b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -721,17 +721,6 @@ endif #HYPERVISOR_GUEST config NO_BOOTMEM def_bool y -config MEMTEST - bool "Memtest" - ---help--- - This option adds a kernel parameter 'memtest', which allows memtest - to be set. - memtest=0, mean disabled; -- default - memtest=1, mean do 1 test pattern; - ... - memtest=4, mean do 4 test patterns. - If you are unsure how to answer this question, answer N. - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 779c2efe2e97..3ab0537872fb 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_MEMTEST -extern void early_memtest(unsigned long start, unsigned long end); -#else -static inline void early_memtest(unsigned long start, unsigned long end) -{ -} -#endif - extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern u64 early_reserve_e820(u64 sizet, u64 align); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c4cc74006c61..a482d105172b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMTEST) += memtest.o - obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c deleted file mode 100644 index 1e9da795767a..000000000000 --- a/arch/x86/mm/memtest.c +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static u64 patterns[] __initdata = { - /* The first entry has to be 0 to leave memtest with zeroed memory */ - 0, - 0xffffffffffffffffULL, - 0x5555555555555555ULL, - 0xaaaaaaaaaaaaaaaaULL, - 0x1111111111111111ULL, - 0x2222222222222222ULL, - 0x4444444444444444ULL, - 0x8888888888888888ULL, - 0x3333333333333333ULL, - 0x6666666666666666ULL, - 0x9999999999999999ULL, - 0xccccccccccccccccULL, - 0x7777777777777777ULL, - 0xbbbbbbbbbbbbbbbbULL, - 0xddddddddddddddddULL, - 0xeeeeeeeeeeeeeeeeULL, - 0x7a6c7258554e494cULL, /* yeah ;-) */ -}; - -static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) -{ - printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", - (unsigned long long) pattern, - (unsigned long long) start_bad, - (unsigned long long) end_bad); - memblock_reserve(start_bad, end_bad - start_bad); -} - -static void __init memtest(u64 pattern, u64 start_phys, u64 size) -{ - u64 *p, *start, *end; - u64 start_bad, last_bad; - u64 start_phys_aligned; - const size_t incr = sizeof(pattern); - - start_phys_aligned = ALIGN(start_phys, incr); - start = __va(start_phys_aligned); - end = start + (size - (start_phys_aligned - start_phys)) / incr; - start_bad = 0; - last_bad = 0; - - for (p = start; p < end; p++) - *p = pattern; - - for (p = start; p < end; p++, start_phys_aligned += incr) { - if (*p == pattern) - continue; - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - continue; - } - if (start_bad) - reserve_bad_mem(pattern, start_bad, last_bad + incr); - start_bad = last_bad = start_phys_aligned; - } - if (start_bad) - reserve_bad_mem(pattern, start_bad, last_bad + incr); -} - -static void __init do_one_pass(u64 pattern, u64 start, u64 end) -{ - u64 i; - phys_addr_t this_start, this_end; - - for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { - this_start = clamp_t(phys_addr_t, this_start, start, end); - this_end = clamp_t(phys_addr_t, this_end, start, end); - if (this_start < this_end) { - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long)this_start, - (unsigned long long)this_end, - (unsigned long long)cpu_to_be64(pattern)); - memtest(pattern, this_start, this_end - this_start); - } - } -} - -/* default is disabled */ -static int memtest_pattern __initdata; - -static int __init parse_memtest(char *arg) -{ - if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); - else - memtest_pattern = ARRAY_SIZE(patterns); - - return 0; -} - -early_param("memtest", parse_memtest); - -void __init early_memtest(unsigned long start, unsigned long end) -{ - unsigned int i; - unsigned int idx = 0; - - if (!memtest_pattern) - return; - - printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); - for (i = memtest_pattern-1; i < UINT_MAX; --i) { - idx = i % ARRAY_SIZE(patterns); - do_one_pass(patterns[idx], start, end); - } -} -- cgit v1.2.3 From 130005231c9f2090b1b177e2cca9841b562c1784 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 15 Apr 2015 10:24:54 +0800 Subject: kvm: mmu: don't do memslot overflow check As Andres pointed out: | I don't understand the value of this check here. Are we looking for a | broken memslot? Shouldn't this be a BUG_ON? Is this the place to care | about these things? npages is capped to KVM_MEM_MAX_NR_PAGES, i.e. | 2^31. A 64 bit overflow would be caused by a gigantic gfn_start which | would be trouble in many other ways. This patch drops the memslot overflow check to make the codes more simple. Reviewed-by: Andres Lagar-Cavilla Signed-off-by: Wanpeng Li Message-Id: <1429064694-3072-1-git-send-email-wanpeng.li@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 146f295ee322..07bb22157338 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4504,19 +4504,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush = false; unsigned long *rmapp; unsigned long last_index, index; - gfn_t gfn_start, gfn_end; spin_lock(&kvm->mmu_lock); - gfn_start = memslot->base_gfn; - gfn_end = memslot->base_gfn + memslot->npages - 1; - - if (gfn_start >= gfn_end) - goto out; - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(gfn_end, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) @@ -4534,7 +4527,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); -out: spin_unlock(&kvm->mmu_lock); } -- cgit v1.2.3 From decf63336e356423300b935afbebeca1fcb15184 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 14 Apr 2015 12:04:10 +0800 Subject: KVM: MMU: fix comment in kvm_mmu_zap_collapsible_spte Soft mmu uses direct shadow page to fill guest large mapping with small pages if huge mapping is disallowed on host. So zapping direct shadow page works well both for soft mmu and hard mmu, it's just less widely applicable. Fix the comment to reflect this. Signed-off-by: Xiao Guangrong Message-Id: <552C91BA.1010703@linux.intel.com> [Fix comment wording further. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07bb22157338..d43867c33bc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn = spte_to_pfn(*sptep); /* - * Only EPT supported for now; otherwise, one would need to - * find out efficiently whether the guest page tables are - * also using huge pages. + * We cannot do huge page mapping for indirect shadow pages, + * which are found on the last rmap (level = 1) when not using + * tdp; such shadow pages are synced with the page table in + * the guest, and the guest page table is using 4K page size + * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && -- cgit v1.2.3 From bb668734c4c960c8f61f017585b323b97e5f47b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 17 Mar 2015 22:26:21 +0000 Subject: VFS: assorted d_backing_inode() annotations Signed-off-by: David Howells Signed-off-by: Al Viro --- arch/x86/kvm/assigned-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 6eb5c20ee373..d090ecf08809 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -666,7 +666,7 @@ static int probe_sysfs_permissions(struct pci_dev *dev) if (r) return r; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); path_put(&path); -- cgit v1.2.3 From 3ac62bc0602794dc36aad77a7ef5772e989b2e22 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:45 -0700 Subject: x86: mtrr: if: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/if.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a041e094b8b9..d76f13d6d8d6 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = { static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; - int i, max, len; + int i, max; mtrr_type type; unsigned long base, size; - len = 0; max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); @@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) size >>= 20 - PAGE_SHIFT; } /* Base can be > 32bit */ - len += seq_printf(seq, "reg%02i: base=0x%06lx000 " - "(%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, - factor, mtrr_usage_table[i], - mtrr_attrib_to_str(type)); + seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), + size, factor, + mtrr_usage_table[i], mtrr_attrib_to_str(type)); } return 0; } -- cgit v1.2.3 From fd0f86b66425bd8c6af8985881e82b28c30fd450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 Apr 2015 00:40:25 -0700 Subject: x86/ptrace: Fix the TIF_FORCED_TF logic in handle_signal() When the TIF_SINGLESTEP tracee dequeues a signal, handle_signal() clears TIF_FORCED_TF and X86_EFLAGS_TF but leaves TIF_SINGLESTEP set. If the tracer does PTRACE_SINGLESTEP again, enable_single_step() sets X86_EFLAGS_TF but not TIF_FORCED_TF. This means that the subsequent PTRACE_CONT doesn't not clear X86_EFLAGS_TF, and the tracee gets the wrong SIGTRAP. Test-case (needs -O2 to avoid prologue insns in signal handler): #include #include #include #include #include #include #include void handler(int n) { asm("nop"); } int child(void) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); signal(SIGALRM, handler); kill(getpid(), SIGALRM); return 0x23; } void *getip(int pid) { return (void*)ptrace(PTRACE_PEEKUSER, pid, offsetof(struct user, regs.rip), 0); } int main(void) { int pid, status; pid = fork(); if (!pid) return child(); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGALRM); assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP); assert((getip(pid) - (void*)handler) == 0); assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP); assert((getip(pid) - (void*)handler) == 1); assert(ptrace(PTRACE_CONT, pid, 0,0) == 0); assert(wait(&status) == pid); assert(WIFEXITED(status) && WEXITSTATUS(status) == 0x23); return 0; } The last assert() fails because PTRACE_CONT wrongly triggers another single-step and X86_EFLAGS_TF can't be cleared by debugger until the tracee does sys_rt_sigreturn(). Change handle_signal() to do user_disable_single_step() if stepping, we do not need to preserve TIF_SINGLESTEP because we are going to do ptrace_notify(), and it is simply wrong to leak this bit. While at it, change the comment to explain why we also need to clear TF unconditionally after setup_rt_frame(). Note: in the longer term we should probably change setup_sigcontext() to use get_flags() and then just remove this user_disable_single_step(). And, the state of TIF_FORCED_TF can be wrong after restore_sigcontext() which can set/clear TF, this needs another fix. This fix fixes the 'single_step_syscall_32' testcase in the x86 testsuite: Before: ~/linux/tools/testing/selftests/x86> ./single_step_syscall_32 [RUN] Set TF and check nop [OK] Survived with TF set and 9 traps [RUN] Set TF and check int80 [OK] Survived with TF set and 9 traps [RUN] Set TF and check a fast syscall [WARN] Hit 10000 SIGTRAPs with si_addr 0xf7789cc0, ip 0xf7789cc0 Trace/breakpoint trap (core dumped) After: ~/linux/linux/tools/testing/selftests/x86> ./single_step_syscall_32 [RUN] Set TF and check nop [OK] Survived with TF set and 9 traps [RUN] Set TF and check int80 [OK] Survived with TF set and 9 traps [RUN] Set TF and check a fast syscall [OK] Survived with TF set and 39 traps [RUN] Fast syscall with TF cleared [OK] Nothing unexpected happened Reported-by: Evan Teran Reported-by: Pedro Alves Tested-by: Andres Freund Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner [ Added x86 self-test info. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 3e581865c8e2..d185bdd95a4b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -630,7 +630,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - bool failed; + bool stepping, failed; + /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -654,12 +655,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) } /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now + * so that register information in the sigcontext is correct and + * then notify the tracer before entering the signal handler. */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; + stepping = test_thread_flag(TIF_SINGLESTEP); + if (stepping) + user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { @@ -670,10 +672,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * it might disable possible debug exception from the * signal handler. * - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. + * Clear TF for the case when it wasn't set by debugger to + * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* @@ -682,7 +682,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) if (used_math()) fpu_reset_state(current); } - signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); + signal_setup_done(failed, ksig, stepping); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From c857eb56e6e8e53dccd8d1e7ea90bcaf3311996d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2015 20:14:53 +0200 Subject: perf/x86: Fix hw_perf_event::flags collision Somehow we ended up with overlapping flags when merging the RDPMC control flag - this is bad, fix it. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 329f0356ad4a..6ac5cb7a9e14 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -65,15 +65,15 @@ struct event_constraint { /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x80 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ +#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ struct amd_nb { -- cgit v1.2.3 From 517e6341fa123ec3a2f9ea78ad547be910529881 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Apr 2015 12:16:22 +0200 Subject: perf/x86/intel: Fix Core2,Atom,NHM,WSM cycles:pp events Ingo reported that cycles:pp didn't work for him on some machines. It turns out that in this commit: af4bdcf675cf perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events Andi forgot to explicitly allow that event when he disabled event flags for PEBS on those uarchs. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: af4bdcf675cf ("perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events") Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ca69ea56c712..813f75d71175 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -558,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), EVENT_CONSTRAINT_END }; @@ -565,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), EVENT_CONSTRAINT_END }; @@ -588,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; @@ -603,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; -- cgit v1.2.3 From 645523960102fa0ac0578d070630e49ab05f06d1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 26 Mar 2015 14:28:45 -0700 Subject: perf/x86/intel/rapl: Fix energy counter measurements but supporing per domain energy units RAPL energy hardware unit can vary within a single CPU package, e.g. HSW server DRAM has a fixed energy unit of 15.3 uJ (2^-16) whereas the unit on other domains can be enumerated from power unit MSR. There might be other variations in the future, this patch adds per cpu model quirk to allow special handling of certain cpus. hw_unit is also removed from per cpu data since it is not per cpu and the sampling rate for energy counter is typically not high. Without this patch, DRAM domain on HSW servers will be counted 4x higher than the real energy counter. Signed-off-by: Jacob Pan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1427405325-780-1-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 94 ++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index c4bb8b8e5017..999289b94025 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -62,6 +62,14 @@ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define NR_RAPL_DOMAINS 0x4 +static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", + "pp1-gpu", +}; + /* Clients have PP0, PKG */ #define RAPL_IDX_CLN (1< NR_RAPL_DOMAINS) { + pr_warn("invalid domain %d, failed to scale data\n", cfg); + return v; + } /* * scale delta to smallest unit (1/2^32) * users must then scale back: count * 1/(1e9*2^32) to get Joules * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); + return v << (32 - rapl_hw_unit[cfg - 1]); } static u64 rapl_event_update(struct perf_event *event) @@ -173,7 +195,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - sdelta = rapl_scale(delta); + sdelta = rapl_scale(delta, event->hw.config); local64_add(sdelta, &event->count); @@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu) cpumask_set_cpu(cpu, &rapl_cpu_mask); } +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; - u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; - spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); - /* - * grab power unit as: 1/2^unit Joules - * - * we cache in local PMU instance - */ - pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu) * divide interval by 2 to avoid lockstep (2 * 100) * if hw unit is 32, then we use 2 ms 1/200/2 */ - if (pmu->hw_unit < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + if (rapl_hw_unit[0] < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); else ms = 2; @@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static int rapl_check_hw_unit(void) +{ + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + return 0; +} + static const struct x86_cpu_id rapl_cpu_match[] = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, @@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void) { struct rapl_pmu *pmu; int cpu, ret; + struct x86_pmu_quirk *quirk; + int i; /* * check for Intel processor family 6 @@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 63: /* Haswell-Server */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; case 60: /* Haswell */ case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_HSW; @@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void) /* unsupported */ return 0; } + ret = rapl_check_hw_unit(); + if (ret) + return ret; + /* run cpu model quirks */ + for (quirk = rapl_quirks; quirk; quirk = quirk->next) + quirk->func(); cpu_notifier_register_begin(); for_each_online_cpu(cpu) { @@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void) pmu = __this_cpu_read(rapl_pmu); - pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + pr_info("RAPL PMU detected," " API unit is 2^-32 Joules," " %d fixed counters" " %llu ms ovfl timer\n", - pmu->hw_unit, hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); - + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } out: cpu_notifier_register_done(); -- cgit v1.2.3 From 78d504bcd769cc496f63b626f507039eab2316b7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 2 Apr 2015 04:12:57 -0400 Subject: perf/x86/intel: Add Broadwell support for the LBR callstack Same as Haswell, Broadwell also support the LBR callstack. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andi Kleen Link: http://lkml.kernel.org/r/1427962377-40955-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9da2400c2ec3..219d3fb423a1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3275,7 +3275,7 @@ __init int intel_pmu_init(void) hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - intel_pmu_lbr_init_snb(); + intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_bdw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; -- cgit v1.2.3 From 18ecb3bfa5a9f6fffbb3eeb4369f0b9463438ec0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Apr 2015 20:41:37 +0200 Subject: x86/fpu: Load xsave pointer *after* initialization So I was playing with gdb today and did this simple thing: gdb /bin/ls ... (gdb) run Box exploded with this splat: BUG: unable to handle kernel NULL pointer dereference at 00000000000001d0 IP: [] xstateregs_get+0x7a/0x120 [...] Call Trace: ptrace_regset ptrace_request ? wait_task_inactive ? preempt_count_sub arch_ptrace ? ptrace_get_task_struct SyS_ptrace system_call_fastpath ... because we do cache &target->thread.fpu.state->xsave into the local variable xsave but that pointer is NULL at that time and it gets initialized later, in init_fpu(), see: e7f180dcd8ab ("x86/fpu: Change xstateregs_get()/set() to use ->xsave.i387 rather than ->fxsave") The fix is simple: load xsave *after* init_fpu() has run. Also do the same in xstateregs_set(), as suggested by Oleg Nesterov. Signed-off-by: Borislav Petkov Acked-by: Oleg Nesterov Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Tavis Ormandy Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429209697-5902-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 367f39d35e9c..009183276bb7 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -341,7 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -351,6 +351,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + /* * Copy the 48bytes defined by the software first into the xstate * memory layout in the thread struct, so that we can copy the entire @@ -369,7 +371,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -379,6 +381,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. -- cgit v1.2.3 From 8eb68bf75eb7ea73bce88cb9d42efd3bbcece3cd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Apr 2015 12:49:04 -0700 Subject: x86: switch to using asm-generic for seccomp.h Switch to using the newly created asm-generic/seccomp.h for the seccomp strict mode syscall definitions. The obsolete sigreturn syscall override is retained in 32-bit mode, and the ia32 syscall overrides are used in the compat case. Remaining definitions were identical. Signed-off-by: Kees Cook Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/seccomp.h | 21 ++++++++++++++++++--- arch/x86/include/asm/seccomp_32.h | 11 ----------- arch/x86/include/asm/seccomp_64.h | 17 ----------------- 3 files changed, 18 insertions(+), 31 deletions(-) delete mode 100644 arch/x86/include/asm/seccomp_32.h delete mode 100644 arch/x86/include/asm/seccomp_64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 0f3d7f099224..0c8c7c8861b4 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -1,5 +1,20 @@ +#ifndef _ASM_X86_SECCOMP_H +#define _ASM_X86_SECCOMP_H + +#include + #ifdef CONFIG_X86_32 -# include -#else -# include +#define __NR_seccomp_sigreturn __NR_sigreturn #endif + +#ifdef CONFIG_COMPAT +#include +#define __NR_seccomp_read_32 __NR_ia32_read +#define __NR_seccomp_write_32 __NR_ia32_write +#define __NR_seccomp_exit_32 __NR_ia32_exit +#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn +#endif + +#include + +#endif /* _ASM_X86_SECCOMP_H */ diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h deleted file mode 100644 index b811d6f5780c..000000000000 --- a/arch/x86/include/asm/seccomp_32.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_X86_SECCOMP_32_H -#define _ASM_X86_SECCOMP_32_H - -#include - -#define __NR_seccomp_read __NR_read -#define __NR_seccomp_write __NR_write -#define __NR_seccomp_exit __NR_exit -#define __NR_seccomp_sigreturn __NR_sigreturn - -#endif /* _ASM_X86_SECCOMP_32_H */ diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h deleted file mode 100644 index 84ec1bd161a5..000000000000 --- a/arch/x86/include/asm/seccomp_64.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _ASM_X86_SECCOMP_64_H -#define _ASM_X86_SECCOMP_64_H - -#include -#include - -#define __NR_seccomp_read __NR_read -#define __NR_seccomp_write __NR_write -#define __NR_seccomp_exit __NR_exit -#define __NR_seccomp_sigreturn __NR_rt_sigreturn - -#define __NR_seccomp_read_32 __NR_ia32_read -#define __NR_seccomp_write_32 __NR_ia32_write -#define __NR_seccomp_exit_32 __NR_ia32_exit -#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn - -#endif /* _ASM_X86_SECCOMP_64_H */ -- cgit v1.2.3 From 98b228f55014870092c15d7d168fecac69f2f12a Mon Sep 17 00:00:00 2001 From: Roy Franz Date: Wed, 15 Apr 2015 16:32:24 -0700 Subject: x86/efi: Store upper bits of command line buffer address in ext_cmd_line_ptr Until now, the EFI stub was only setting the 32 bit cmd_line_ptr in the setup_header structure, so on 64 bit platforms this could be truncated. This patch adds setting the upper bits of the buffer address in ext_cmd_line_ptr. This case was likely never hit, as the allocation for this buffer is done at the lowest available address. Only x86_64 kernels have this problem, as the 1-1 mapping mandated by EFI ensures that all memory is 32 bit addressable on 32 bit platforms. The EFI stub does not support mixed mode, so the 32 bit kernel on 64 bit firmware case does not need to be handled. Signed-off-by: Roy Franz Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 92b9a5f2aed6..5999980206bf 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1110,6 +1110,8 @@ struct boot_params *make_boot_params(struct efi_config *c) if (!cmdline_ptr) goto fail; hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; + /* Fill in upper bits of command line address, NOP on 32 bit */ + boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; -- cgit v1.2.3 From 0c99241c93b8060441f3c8434848e54b5338f922 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 16 Apr 2015 12:38:30 +0200 Subject: perf/x86/intel/pt: Fix and clean up error handling in pt_event_add() Dan Carpenter reported that pt_event_add() has buggy error handling logic: it returns 0 instead of -EBUSY when it fails to start a newly added event. Furthermore, the control flow in this function is messy, with cleanup labels mixed with direct returns. Fix the bug and clean up the code by converting it to a straight fast path for the regular non-failing case, plus a clear sequence of cascading goto labels to do all cleanup. NOTE: I materially changed the existing clean up logic in the pt_event_start() failure case to use the direct perf_aux_output_end() path, not pt_event_del(), because perf_aux_output_end() is enough here. Reported-by: Dan Carpenter Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Julia Lawall Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150416103830.GB7847@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 33 ++++++++++++++----------------- 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index f2770641c0fd..ffe666c2c6b5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -988,39 +988,36 @@ static int pt_event_add(struct perf_event *event, int mode) int ret = -EBUSY; if (pt->handle.event) - goto out; + goto fail; buf = perf_aux_output_begin(&pt->handle, event); - if (!buf) { - ret = -EINVAL; - goto out; - } + ret = -EINVAL; + if (!buf) + goto fail_stop; pt_buffer_reset_offsets(buf, pt->handle.head); if (!buf->snapshot) { ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) { - perf_aux_output_end(&pt->handle, 0, true); - goto out; - } + if (ret) + goto fail_end_stop; } if (mode & PERF_EF_START) { pt_event_start(event, 0); - if (hwc->state == PERF_HES_STOPPED) { - pt_event_del(event, 0); - ret = -EBUSY; - } + ret = -EBUSY; + if (hwc->state == PERF_HES_STOPPED) + goto fail_end_stop; } else { hwc->state = PERF_HES_STOPPED; } - ret = 0; -out: - - if (ret) - hwc->state = PERF_HES_STOPPED; + return 0; +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; +fail: return ret; } -- cgit v1.2.3 From a6dfa128ce5c414ab46b1d690f7a1b8decb8526d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Apr 2015 15:04:48 -0400 Subject: config: Enable NEED_DMA_MAP_STATE by default when SWIOTLB is selected A huge amount of NIC drivers use the DMA API, however if compiled under 32-bit an very important part of the DMA API can be ommitted leading to the drivers not working at all (especially if used with 'swiotlb=force iommu=soft'). As Prashant Sreedharan explains it: "the driver [tg3] uses DEFINE_DMA_UNMAP_ADDR(), dma_unmap_addr_set() to keep a copy of the dma "mapping" and dma_unmap_addr() to get the "mapping" value. On most of the platforms this is a no-op, but ... with "iommu=soft and swiotlb=force" this house keeping is required, ... otherwise we pass 0 while calling pci_unmap_/pci_dma_sync_ instead of the DMA address." As such enable this even when using 32-bit kernels. Reported-by: Ian Jackson Signed-off-by: Konrad Rzeszutek Wilk Acked-by: David S. Miller Acked-by: Prashant Sreedharan Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Michael Chan Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: cascardo@linux.vnet.ibm.com Cc: david.vrabel@citrix.com Cc: sanjeevb@broadcom.com Cc: siva.kallam@broadcom.com Cc: vyasevich@gmail.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/20150417190448.GA9462@l.oracle.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..eb79036e3503 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -177,7 +177,7 @@ config SBUS config NEED_DMA_MAP_STATE def_bool y - depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG + depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB config NEED_SG_DMA_LENGTH def_bool y -- cgit v1.2.3 From 0b2bb6925eb602eae993a4b5c282a8c18ad1c949 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 26 Mar 2015 00:50:30 -0400 Subject: tools/power turbostat: Initial Skylake support Skylake adds some additional residency counters. Skylake supports a different mix of RAPL registers from any previous product. In most other ways, Skylake is like Broadwell. Signed-off-by: Len Brown --- arch/x86/include/uapi/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c4c75272314a..fe01b0a784e7 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -150,6 +150,11 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + #define MSR_CORE_C1_RES 0x00000660 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 -- cgit v1.2.3 From 94d4b4765b7ddb8478b0d57663cf7a08e2263bbf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Nov 2012 19:19:07 +0100 Subject: x86/mm: Clean up types in xlate_dev_mem_ptr() Pavel Machek reported the following compiler warning on x86/32 CONFIG_HIGHMEM64G=y builds: arch/x86/mm/ioremap.c:344:10: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Clean up the types in this function by using a single natural type for internal calculations (unsigned long), to make it more apparent what's happening, and also to remove fragile casts. Reported-by: Pavel Machek Cc: jgross@suse.com Cc: roland@purestorage.com Link: http://lkml.kernel.org/r/20150416080440.GA507@amd Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fdf617c00e2f..4bf037b20f47 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -332,18 +332,20 @@ EXPORT_SYMBOL(iounmap); */ void *xlate_dev_mem_ptr(phys_addr_t phys) { - void *addr; - unsigned long start = phys & PAGE_MASK; + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + unsigned long vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_cache(start, PAGE_SIZE); - if (addr) - addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + /* Only add the offset on success and return NULL if the ioremap() failed: */ + if (vaddr) + vaddr += offset; - return addr; + return (void *)vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) -- cgit v1.2.3 From 085e68eeafbf76e21848ad5bafaecec88a11dd64 Mon Sep 17 00:00:00 2001 From: Ben Serebrin Date: Thu, 16 Apr 2015 11:58:05 -0700 Subject: KVM: VMX: Preserve host CR4.MCE value while in guest mode. The host's decision to enable machine check exceptions should remain in force during non-root mode. KVM was writing 0 to cr4 on VCPU reset and passed a slightly-modified 0 to the vmcs.guest_cr4 value. Tested: Built. On earlier version, tested by injecting machine check while a guest is spinning. Before the change, if guest CR4.MCE==0, then the machine check is escalated to Catastrophic Error (CATERR) and the machine dies. If guest CR4.MCE==1, then the machine check causes VMEXIT and is handled normally by host Linux. After the change, injecting a machine check causes normal Linux machine check handling. Signed-off-by: Ben Serebrin Reviewed-by: Venkatesh Srinivas Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f5e8dce8046c..f7b61687bd79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4 = + (cr4_read_shadow() & X86_CR4_MCE) | + (cr4 & ~X86_CR4_MCE) | + (to_vmx(vcpu)->rmode.vm86_active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* -- cgit v1.2.3 From 3b6e042188994466ec257b71296b5f85b894dcd9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Apr 2015 17:26:23 +0200 Subject: perf/x86/intel: Add cpu_(prepare|starting|dying) for core_pmu The core_pmu does not define cpu_* callbacks, which handles allocation of 'struct cpu_hw_events::shared_regs' data, initialization of debug store and PMU_FL_EXCL_CNTRS counters. While this probably won't happen on bare metal, virtual CPU can define x86_pmu.extra_regs together with PMU version 1 and thus be using core_pmu -> using shared_regs data without it being allocated. That could could leave to following panic: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] _spin_lock_irqsave+0x1f/0x40 SNIP [] __intel_shared_reg_get_constraints+0x69/0x1e0 [] intel_get_event_constraints+0x9b/0x180 [] x86_schedule_events+0x75/0x1d0 [] ? check_preempt_curr+0x7c/0x90 [] ? try_to_wake_up+0x24e/0x3e0 [] ? default_wake_function+0x12/0x20 [] ? autoremove_wake_function+0x16/0x40 [] ? __wake_up_common+0x59/0x90 [] ? __d_lookup+0xa7/0x150 [] ? do_lookup+0x9f/0x230 [] ? dput+0x9a/0x150 [] ? path_to_nameidata+0x25/0x60 [] ? __link_path_walk+0x7da/0x1000 [] ? x86_pmu_add+0xb9/0x170 [] x86_pmu_commit_txn+0x67/0xc0 [] ? mntput_no_expire+0x30/0x110 [] ? path_put+0x31/0x40 [] ? current_fs_time+0x27/0x30 [] ? mem_cgroup_get_reclaim_stat_from_page+0x20/0x70 [] group_sched_in+0x13a/0x170 [] ? sched_clock+0x9/0x10 [] ctx_sched_in+0x2e8/0x330 [] perf_event_sched_in+0x6b/0xb0 [] perf_event_context_sched_in+0x76/0xc0 [] perf_event_comm+0x1bb/0x2e0 [] set_task_comm+0x69/0x80 [] setup_new_exec+0xe1/0x2e0 [] load_elf_binary+0x3ce/0x1ab0 Adding cpu_(prepare|starting|dying) for core_pmu to have shared_regs data allocated for core_pmu. AFAICS there's no harm to initialize debug store and PMU_FL_EXCL_CNTRS either for core_pmu. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20150421152623.GC13169@krava.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 66 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 219d3fb423a1..960e85de13fb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2533,34 +2533,6 @@ ssize_t intel_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, -}; - struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; @@ -2743,6 +2715,44 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32-bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL<<31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + /* + * Virtual (or funny metal) CPU can define x86_pmu.extra_regs + * together with PMU version 1 and thus be using core_pmu with + * shared_regs. We need following callbacks here to allocate + * it properly. + */ + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, -- cgit v1.2.3 From 80bcffb376a6890dd7452b12c1ba032f8f24fef6 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Mon, 20 Apr 2015 15:34:07 -0700 Subject: perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile Processor) IMC uncore PMUs This uncore is the same as the Haswell desktop part but uses a different PCI ID. Signed-off-by: Sonny Rao Cc: Arnaldo Carvalho de Melo Cc: Bjorn Helgaas Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonnyrao@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 3001015b755c..ca75e70865ef 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,6 +1,9 @@ /* Nehalem/SandBridge/Haswell uncore support */ #include "perf_event_intel_uncore.h" +/* Uncore IMC PCI Id */ +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ { /* end marker */ } }; -- cgit v1.2.3 From 0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Tue, 21 Apr 2015 12:33:11 -0700 Subject: perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver This keeps all the related PCI IDs together in the driver where they are used. Signed-off-by: Sonny Rao Acked-by: Bjorn Helgaas Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1429644791-25724-1-git-send-email-sonnyrao@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index ca75e70865ef..4562e9e22c60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,7 +1,11 @@ /* Nehalem/SandBridge/Haswell uncore support */ #include "perf_event_intel_uncore.h" -/* Uncore IMC PCI Id */ +/* Uncore IMC PCI IDs */ +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 /* SNB event control */ -- cgit v1.2.3 From 00425bb181c204c8f250fec122e2817a930e0286 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 Apr 2015 08:37:09 +0200 Subject: crypto: x86/sha512_ssse3 - fixup for asm function prototype change Patch e68410ebf626 ("crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to base layer") changed the prototypes of the core asm SHA-512 implementations so that they are compatible with the prototype used by the base layer. However, in one instance, the register that was used for passing the input buffer was reused as a scratch register later on in the code, and since the input buffer param changed places with the digest param -which needs to be written back before the function returns- this resulted in the scratch register to be dereferenced in a memory write operation, causing a GPF. Fix this by changing the scratch register to use the same register as the input buffer param again. Fixes: e68410ebf626 ("crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to base layer") Reported-By: Bobby Powers Tested-By: Bobby Powers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-avx2-asm.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index a4771dcd1fcf..1f20b35d8573 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -79,7 +79,7 @@ NUM_BLKS = %rdx c = %rcx d = %r8 e = %rdx -y3 = %rdi +y3 = %rsi TBL = %rbp -- cgit v1.2.3 From d869844bd081081bf537e806a44811884230643e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Apr 2015 08:33:59 -0700 Subject: x86: fix special __probe_kernel_write() tail zeroing case Commit cae2a173fe94 ("x86: clean up/fix 'copy_in_user()' tail zeroing") fixed the failure case tail zeroing of one special case of the x86-64 generic user-copy routine, namely when used for the user-to-user case ("copy_in_user()"). But in the process it broke an even more unusual case: using the user copy routine for kernel-to-kernel copying. Now, normally kernel-kernel copies are obviously done using memcpy(), but we have a couple of special cases when we use the user-copy functions. One is when we pass a kernel buffer to a regular user-buffer routine, using set_fs(KERNEL_DS). That's a "normal" case, and continued to work fine, because it never takes any faults (with the possible exception of a silent and successful vmalloc fault). But Jan Beulich pointed out another, very unusual, special case: when we use the user-copy routines not because it's a path that expects a user pointer, but for a couple of ftrace/kgdb cases that want to do a kernel copy, but do so using "unsafe" buffers, and use the user-copy routine to gracefully handle faults. IOW, for probe_kernel_write(). And that broke for the case of a faulting kernel destination, because we saw the kernel destination and wanted to try to clear the tail of the buffer. Which doesn't work, since that's what faults. This only triggers for things like kgdb and ftrace users (eg trying setting a breakpoint on read-only memory), but it's definitely a bug. The fix is to not compare against the kernel address start (TASK_SIZE), but instead use the same limits "access_ok()" uses. Reported-and-tested-by: Jan Beulich Cc: stable@vger.kernel.org # 4.0 Signed-off-by: Linus Torvalds --- arch/x86/lib/usercopy_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 1f33b3d1fd68..0a42327a59d7 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); /* If the destination is a kernel buffer, we always clear the end */ - if ((unsigned long)to >= TASK_SIZE_MAX) + if (!__addr_ok(to)) memset(to, 0, len); return len; } -- cgit v1.2.3 From 61f01dd941ba9e06d2bf05994450ecc3d61b6b8b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 26 Apr 2015 16:47:59 -0700 Subject: x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET with SS == 0 results in an invalid usermode state in which SS is apparently equal to __USER_DS but causes #SS if used. Work around the issue by setting SS to __KERNEL_DS __switch_to, thus ensuring that SYSRET never happens with SS set to NULL. This was exposed by a recent vDSO cleanup. Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss Signed-off-by: Andy Lutomirski Cc: Peter Anvin Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Brian Gerst Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 7 +++++++ arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/amd.c | 3 +++ arch/x86/kernel/entry_64.S | 9 +++++++++ arch/x86/kernel/process_64.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 48 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a821b1cd4fa7..72bf2680f819 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -427,6 +427,13 @@ sysretl_from_sys_call: * cs and ss are loaded from MSRs. * (Note: 32bit->32bit SYSRET is different: since r11 * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. */ USERGS_SYSRET32 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ee9b94d9921..3d6606fb97d0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -265,6 +265,7 @@ #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd470ebf924e..e4cf63301ff4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET */ + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..02c2eff7478d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,15 @@ system_call_fastpath: * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) */ USERGS_SYSRET64 diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4baaa972f52a..ddfdbf74f174 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + return prev_p; } -- cgit v1.2.3 From 5dca0d9147458be9b9363b8a484aa77d710b412a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 25 Mar 2015 12:08:14 +0100 Subject: kvm: x86: fix kvmclock update protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kvmclock spec says that the host will increment a version field to an odd number, then update stuff, then increment it to an even number. The host is buggy and doesn't do this, and the result is observable when one vcpu reads another vcpu's kvmclock data. There's no good way for a guest kernel to keep its vdso from reading a different vcpu's kvmclock data, but we don't need to care about changing VCPUs as long as we read a consistent data from kvmclock. (VCPU can change outside of this loop too, so it doesn't matter if we return a value not fit for this VCPU.) Based on a patch by Radim Krčmář. Reviewed-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed31c31b2485..c73efcd03e29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &guest_hv_clock, sizeof(guest_hv_clock)))) return 0; - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. */ - vcpu->hv_clock.version = guest_hv_clock.version + 2; + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); return 0; } -- cgit v1.2.3 From 73459e2a1ada09a68c02cc5b73f3116fc8194b3d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Apr 2015 13:20:18 +0200 Subject: x86: pvclock: Really remove the sched notifier for cross-cpu migrations This reverts commits 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 and 80f7fdb1c7f0f9266421f823964fd1962681f6ce. The task migration notifier was originally introduced in order to support the pvclock vsyscall with non-synchronized TSC, but KVM only supports it with synchronized TSC. Hence, on KVM the race condition is only needed due to a bad implementation on the host side, and even then it's so rare that it's mostly theoretical. As far as KVM is concerned it's possible to fix the host, avoiding the additional complexity in the vDSO and the (re)introduction of the task migration notifier. Xen, on the other hand, hasn't yet implemented vsyscall support at all, so we do not care about its plans for non-synchronized TSC. Reported-by: Peter Zijlstra Suggested-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock.h | 1 - arch/x86/kernel/pvclock.c | 44 ------------------------------------------ arch/x86/vdso/vclock_gettime.c | 34 ++++++++++++++------------------ 3 files changed, 15 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 25b1cc07d496..d6b078e9fa28 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index e5ecd20e72dd..2f355d229a58 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 40d2473836c9..9793322751e0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode) * __getcpu() calls (Gleb). */ - /* Make sure migrate_count will change if we leave the VCPU. */ - do { - pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - - cpu1 = cpu; - cpu = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1)); + pvti = get_pvti(cpu); version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* * Test we're still on the cpu as well as the version. - * - We must read TSC of pvti's VCPU. - * - KVM doesn't follow the versioning protocol, so data could - * change before version if we left the VCPU. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. */ - smp_rmb(); - } while (unlikely((pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; -- cgit v1.2.3 From 2b953a5e994ce279904ec70220f7d4f31d380a0a Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 28 Apr 2015 18:46:20 -0400 Subject: xen: Suspend ticks on all CPUs during suspend Commit 77e32c89a711 ("clockevents: Manage device's state separately for the core") decouples clockevent device's modes from states. With this change when a Xen guest tries to resume, it won't be calling its set_mode op which needs to be done on each VCPU in order to make the hypervisor aware that we are in oneshot mode. This happens because clockevents_tick_resume() (which is an intermediate step of resuming ticks on a processor) doesn't call clockevents_set_state() anymore and because during suspend clockevent devices on all VCPUs (except for the one doing the suspend) are left in ONESHOT state. As result, during resume the clockevents state machine will assume that device is already where it should be and doesn't need to be updated. To avoid this problem we should suspend ticks on all VCPUs during suspend. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/suspend.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d9497698645a..53b4c0811f4f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -88,7 +88,17 @@ static void xen_vcpu_notify_restore(void *data) tick_resume_local(); } +static void xen_vcpu_notify_suspend(void *data) +{ + tick_suspend_local(); +} + void xen_arch_resume(void) { on_each_cpu(xen_vcpu_notify_restore, NULL, 1); } + +void xen_arch_suspend(void) +{ + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); +} -- cgit v1.2.3 From 2c62e8492ed7358bbe7da51666c7e0f6da9474ee Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 30 Apr 2015 12:41:28 +0800 Subject: x86/PCI/ACPI: Make all resources except [io 0xcf8-0xcff] available on PCI bus An IO port or MMIO resource assigned to a PCI host bridge may be consumed by the host bridge itself or available to its child bus/devices. The ACPI specification defines a bit (Producer/Consumer) to tell whether the resource is consumed by the host bridge itself, but firmware hasn't used that bit consistently, so we can't rely on it. Before commit 593669c2ac0f ("x86/PCI/ACPI: Use common ACPI resource interfaces to simplify implementation"), arch/x86/pci/acpi.c ignored all IO port resources defined by acpi_resource_io and acpi_resource_fixed_io to filter out IO ports consumed by the host bridge itself. Commit 593669c2ac0f ("x86/PCI/ACPI: Use common ACPI resource interfaces to simplify implementation") started accepting all IO port and MMIO resources, which caused a regression that IO port resources consumed by the host bridge itself became available to its child devices. Then commit 63f1789ec716 ("x86/PCI/ACPI: Ignore resources consumed by host bridge itself") ignored resources consumed by the host bridge itself by checking the IORESOURCE_WINDOW flag, which accidently removed MMIO resources defined by acpi_resource_memory24, acpi_resource_memory32 and acpi_resource_fixed_memory32. On x86 and IA64 platforms, all IO port and MMIO resources are assumed to be available to child bus/devices except one special case: IO port [0xCF8-0xCFF] is consumed by the host bridge itself to access PCI configuration space. So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF]. This solution will also ease the way to consolidate ACPI PCI host bridge common code from x86, ia64 and ARM64. Related ACPI table are archived at: https://bugzilla.kernel.org/show_bug.cgi?id=94221 Related discussions at: http://patchwork.ozlabs.org/patch/461633/ https://lkml.org/lkml/2015/3/29/304 Fixes: 63f1789ec716 (Ignore resources consumed by host bridge itself) Reported-by: Bernhard Thaler Signed-off-by: Jiang Liu Cc: 4.0+ # 4.0+ Reviewed-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e4695985f9de..d93963340c3c 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -325,6 +325,26 @@ static void release_pci_root_info(struct pci_host_bridge *bridge) kfree(info); } +/* + * An IO port or MMIO resource assigned to a PCI host bridge may be + * consumed by the host bridge itself or available to its child + * bus/devices. The ACPI specification defines a bit (Producer/Consumer) + * to tell whether the resource is consumed by the host bridge itself, + * but firmware hasn't used that bit consistently, so we can't rely on it. + * + * On x86 and IA64 platforms, all IO port and MMIO resources are assumed + * to be available to child bus/devices except one special case: + * IO port [0xCF8-0xCFF] is consumed by the host bridge itself + * to access PCI configuration space. + * + * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF]. + */ +static bool resource_is_pcicfg_ioport(struct resource *res) +{ + return (res->flags & IORESOURCE_IO) && + res->start == 0xCF8 && res->end == 0xCFF; +} + static void probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, int busnum, int domain, @@ -346,8 +366,8 @@ static void probe_pci_root_info(struct pci_root_info *info, "no IO and memory resources present in _CRS\n"); else resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_WINDOW) == 0 || - (entry->res->flags & IORESOURCE_DISABLED)) + if ((entry->res->flags & IORESOURCE_DISABLED) || + resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); else entry->res->name = info->name; -- cgit v1.2.3 From e8a4a2696fecb398b0288c43c0e0dbb91e265bb2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Mon, 4 May 2015 21:15:31 -0700 Subject: x86/spinlocks: Fix regression in spinlock contention detection A spinlock is regarded as contended when there is at least one waiter. Currently, the code that checks whether there are any waiters rely on tail value being greater than head. However, this is not true if tail reaches the max value and wraps back to zero, so arch_spin_is_contended() incorrectly returns 0 (not contended) when tail is smaller than head. The original code (before regression) handled this case by casting the (tail - head) to an unsigned value. This change simply restores that behavior. Fixes: d6abfdb20223 ("x86/spinlocks/paravirt: Fix memory corruption on unlock") Signed-off-by: Tahsin Erdogan Cc: peterz@infradead.org Cc: Waiman.Long@hp.com Cc: borntraeger@de.ibm.com Cc: oleg@redhat.com Cc: raghavendra.kt@linux.vnet.ibm.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1430799331-20445-1-git-send-email-tahsin@google.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/spinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index cf87de3fc390..64b611782ef0 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -169,7 +169,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock) struct __raw_tickets tmp = READ_ONCE(lock->tickets); tmp.head &= ~TICKET_SLOWPATH_FLAG; - return (tmp.tail - tmp.head) > TICKET_LOCK_INC; + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended -- cgit v1.2.3 From a71dbdaa8ca2933391b08e0ae5567083e3af0892 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 4 May 2015 11:02:15 -0400 Subject: hypervisor/x86/xen: Unset X86_BUG_SYSRET_SS_ATTRS on Xen PV guests Commit 61f01dd941ba ("x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue") makes AMD processors set SS to __KERNEL_DS in __switch_to() to deal with cases when SS is NULL. This breaks Xen PV guests who do not want to load SS with__KERNEL_DS. Since the problem that the commit is trying to address would have to be fixed in the hypervisor (if it in fact exists under Xen) there is no reason to set X86_BUG_SYSRET_SS_ATTRS flag for PV VPCUs here. This can be easily achieved by adding x86_hyper_xen_hvm.set_cpu_features op which will clear this flag. (And since this structure is no longer HVM-specific we should do some renaming). Signed-off-by: Boris Ostrovsky Reported-by: Sander Eikelenboom Signed-off-by: David Vrabel --- arch/x86/include/asm/hypervisor.h | 2 +- arch/x86/kernel/cpu/hypervisor.c | 4 ++-- arch/x86/xen/enlighten.c | 27 ++++++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e42f758a0fbd..055ea9941dd5 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -50,7 +50,7 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_xen; extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 36ce402a3fa5..d820d8eae96b 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -27,8 +27,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN_PVHVM - &x86_hyper_xen_hvm, +#ifdef CONFIG_XEN + &x86_hyper_xen, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 94578efd3067..46957ead3060 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1760,6 +1760,9 @@ static struct notifier_block xen_hvm_cpu_notifier = { static void __init xen_hvm_guest_init(void) { + if (xen_pv_domain()) + return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); @@ -1775,6 +1778,7 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); } +#endif static bool xen_nopv = false; static __init int xen_parse_nopv(char *arg) @@ -1784,14 +1788,11 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); -static uint32_t __init xen_hvm_platform(void) +static uint32_t __init xen_platform(void) { if (xen_nopv) return 0; - if (xen_pv_domain()) - return 0; - return xen_cpuid_base(); } @@ -1809,11 +1810,19 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { - .name = "Xen HVM", - .detect = xen_hvm_platform, +static void xen_set_cpu_features(struct cpuinfo_x86 *c) +{ + if (xen_pv_domain()) + clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); +} + +const struct hypervisor_x86 x86_hyper_xen = { + .name = "Xen", + .detect = xen_platform, +#ifdef CONFIG_XEN_PVHVM .init_platform = xen_hvm_guest_init, +#endif .x2apic_available = xen_x2apic_para_available, + .set_cpu_features = xen_set_cpu_features, }; -EXPORT_SYMBOL(x86_hyper_xen_hvm); -#endif +EXPORT_SYMBOL(x86_hyper_xen); -- cgit v1.2.3 From de71ad2c97862eae1516aa36528cc3b317c17b2f Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 4 May 2015 15:16:44 -0300 Subject: x86: Make cpu_tss available to external modules Commit 75182b1632 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()") changed current_thread_info to use this_cpu_sp0, and indirectly made it rely on init_tss which was exported with EXPORT_PER_CPU_SYMBOL_GPL. As a result some macros and inline functions such as set/get_fs, test_thread_flag and variants have been made unusable for external modules. Make cpu_tss exported with EXPORT_PER_CPU_SYMBOL so that these functions are accessible again, as they were previously. Signed-off-by: Marc Dionne Acked-by: Andy Lutomirski Link: http://lkml.kernel.org/r/1430763404-21221-1-git-send-email-marc.dionne@your-file-system.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8213da62b1b7..bfc99b3b6522 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,7 +57,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); -- cgit v1.2.3 From c88d47480d300eaad80c213d50c9bf6077fc49bc Mon Sep 17 00:00:00 2001 From: Bobby Powers Date: Mon, 27 Apr 2015 08:10:41 -0700 Subject: x86/fpu: Always restore_xinit_state() when use_eager_cpu() The following commit: f893959b0898 ("x86/fpu: Don't abuse drop_init_fpu() in flush_thread()") removed drop_init_fpu() usage from flush_thread(). This seems to break things for me - the Go 1.4 test suite fails all over the place with floating point comparision errors (offending commit found through bisection). The functional change was that flush_thread() after this commit only calls restore_init_xstate() when both use_eager_fpu() and !used_math() are true. drop_init_fpu() (now fpu_reset_state()) calls restore_init_xstate() regardless of whether current used_math() - apply the same logic here. Switch used_math() -> tsk_used_math(tsk) to consistently use the grabbed tsk instead of current, like in the rest of flush_thread(). Tested-by: Dave Hansen Signed-off-by: Bobby Powers Signed-off-by: Borislav Petkov Acked-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Suresh Siddha Cc: Thomas Gleixner Fixes: f893959b ("x86/fpu: Don't abuse drop_init_fpu() in flush_thread()") Link: http://lkml.kernel.org/r/1430147441-9820-1-git-send-email-bobbypowers@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bfc99b3b6522..6e338e3b1dc0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -156,11 +156,13 @@ void flush_thread(void) /* FPU state will be reallocated lazily at the first use. */ drop_fpu(tsk); free_thread_xstate(tsk); - } else if (!used_math()) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); + } else { + if (!tsk_used_math(tsk)) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(init_fpu(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + } restore_init_xstate(); } } -- cgit v1.2.3 From 8746515d7f04c9ea94cf43e2db1fd2cfca93276d Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 24 Apr 2015 10:16:40 +0100 Subject: xen: Add __GFP_DMA flag when xen_swiotlb_init gets free pages on ARM Make sure that xen_swiotlb_init allocates buffers that are DMA capable when at least one memblock is available below 4G. Otherwise we assume that all devices on the SoC can cope with >4G addresses. We do this on ARM and ARM64, where dom0 is mapped 1:1, so pfn == mfn in this case. No functional changes on x86. From: Chen Baozi Signed-off-by: Chen Baozi Signed-off-by: Stefano Stabellini Tested-by: Chen Baozi Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/page.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 358dcd338915..c44a5d53e464 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -269,4 +269,9 @@ static inline bool xen_arch_need_swiotlb(struct device *dev, return false; } +static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order) +{ + return __get_free_pages(__GFP_NOWARN, order); +} + #endif /* _ASM_X86_XEN_PAGE_H */ -- cgit v1.2.3