From 8e57c586c6a8333c266a6cf76d50e110f2954558 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 3 Feb 2015 12:40:54 +0100
Subject: perf/x86/intel/uncore: Delete an unnecessary check before
 pci_dev_put() call

The pci_dev_put() function tests whether its argument is NULL and then
returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/54D0B59C.2060106@users.sourceforge.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
 		}
 	}
 
-	if (ubox_dev)
-		pci_dev_put(ubox_dev);
+	pci_dev_put(ubox_dev);
 
 	return err ? pcibios_err_to_errno(err) : 0;
 }
-- 
cgit v1.2.3


From c796b205b88c775fd220c1a63390bac6a8cdda3f Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Date: Fri, 23 Jan 2015 12:19:35 -0600
Subject: perf/x86/amd/ibs: Convert force_ibs_eilvt_setup() to void

The caller of force_ibs_eilvt_setup() is ibs_eilvt_setup()
which does not care about the return values.

So mark it void and clean up the return statements.

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <hpa@zytor.com>
Cc: <paulus@samba.org>
Cc: <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1422037175-20957-1-git-send-email-aravind.gopalakrishnan@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
  * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
  * is using the new offset.
  */
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
 {
 	int offset;
 	int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
 
 	if (offset == APIC_EILVT_NR_MAX) {
 		printk(KERN_DEBUG "No EILVT entry available\n");
-		return -EBUSY;
+		return;
 	}
 
 	ret = setup_ibs_ctl(offset);
 	if (ret)
 		goto out;
 
-	if (!ibs_eilvt_valid()) {
-		ret = -EFAULT;
+	if (!ibs_eilvt_valid())
 		goto out;
-	}
 
 	pr_info("IBS: LVT offset %d assigned\n", offset);
 
-	return 0;
+	return;
 out:
 	preempt_disable();
 	put_eilvt(offset);
 	preempt_enable();
-	return ret;
+	return;
 }
 
 static void ibs_eilvt_setup(void)
-- 
cgit v1.2.3


From 27ac905b8f88d28779b0661809286b5ba2817d37 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:57 -0500
Subject: perf/x86/intel: Reduce lbr_sel_map[] size

The index of lbr_sel_map is bit value of perf branch_sample_type.
PERF_SAMPLE_BRANCH_MAX is 1024 at present, so each lbr_sel_map uses
4096 bytes. By using bit shift as index, we can reduce lbr_sel_map
size to 40 bytes. This patch defines 'bit shift' for branch types,
and use 'bit shift' to define lbr_sel_maps.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: jolsa@redhat.com
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1415156173-10035-2-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  4 +++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 54 ++++++++++++++----------------
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..0c45b22495dc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -515,6 +515,10 @@ struct x86_pmu {
 	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+enum {
+	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+};
+
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..8bc078f43a82 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,10 +69,6 @@ static enum {
 #define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
 #define LBR_FROM_FLAG_ABORT    (1ULL << 61)
 
-#define for_each_branch_sample_type(x) \
-	for ((x) = PERF_SAMPLE_BRANCH_USER; \
-	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
 /*
  * x86control flow change classification
  * x86control flow changes include branches, interrupts, traps, faults
@@ -403,14 +399,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 {
 	struct hw_perf_event_extra *reg;
 	u64 br_type = event->attr.branch_sample_type;
-	u64 mask = 0, m;
-	u64 v;
+	u64 mask = 0, v;
+	int i;
 
-	for_each_branch_sample_type(m) {
-		if (!(br_type & m))
+	for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
+		if (!(br_type & (1ULL << i)))
 			continue;
 
-		v = x86_pmu.lbr_sel_map[m];
+		v = x86_pmu.lbr_sel_map[i];
 		if (v == LBR_NOT_SUPP)
 			return -EOPNOTSUPP;
 
@@ -678,35 +674,35 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 /*
  * Map interface branch filters onto LBR filters
  */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_REL_JMP
-					| LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_REL_JMP
+						| LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
 	 */
-	[PERF_SAMPLE_BRANCH_ANY_CALL] =
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
 	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
 	 */
-	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
-	[PERF_SAMPLE_BRANCH_COND]     = LBR_JCC,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
 };
 
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_FAR,
-	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL
-					| LBR_FAR,
-	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL,
-	[PERF_SAMPLE_BRANCH_COND]       = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
 /* core */
-- 
cgit v1.2.3


From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:58 -0500
Subject: perf: Introduce pmu context switch callback

The callback is invoked when process is scheduled in or out.
It provides mechanism for later patches to save/store the LBR
stack. For the schedule in case, the callback is invoked at
the same place that flush branch stack callback is invoked.
So it also can replace the flush branch stack callback. To
avoid unnecessary overhead, the callback is enabled only when
there are events use the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 7 +++++++
 arch/x86/kernel/cpu/perf_event.h | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..0efbd6cc2966 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	if (x86_pmu.sched_task)
+		x86_pmu.sched_task(ctx, sched_in);
+}
+
 static void x86_pmu_flush_branch_stack(void)
 {
 	if (x86_pmu.flush_branch_stack)
@@ -1950,6 +1956,7 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
+	.sched_task		= x86_pmu_sched_task,
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 0c45b22495dc..211b54c0c00c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -473,6 +473,8 @@ struct x86_pmu {
 
 	void		(*check_microcode)(void);
 	void		(*flush_branch_stack)(void);
+	void		(*sched_task)(struct perf_event_context *ctx,
+				      bool sched_in);
 
 	/*
 	 * Intel Arch Perfmon v2+
-- 
cgit v1.2.3


From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:59 -0500
Subject: perf/x86/intel: Use context switch callback to flush LBR stack

Previous commit introduces context switch callback, its function
overlaps with the flush branch stack callback. So we can use the
context switch callback to flush LBR stack.

This patch adds code that uses the flush branch callback to
flush the LBR stack when task is being scheduled in. The callback
is enabled only when there are events use the LBR hardware. This
patch also removes all old flush branch stack code.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c           |  7 -------
 arch/x86/kernel/cpu/perf_event.h           |  3 ++-
 arch/x86/kernel/cpu/perf_event_intel.c     | 14 +-------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 27 +++++++++++++++++++++++++++
 4 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0efbd6cc2966..6b1fd26a37cf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 		x86_pmu.sched_task(ctx, sched_in);
 }
 
-static void x86_pmu_flush_branch_stack(void)
-{
-	if (x86_pmu.flush_branch_stack)
-		x86_pmu.flush_branch_stack();
-}
-
 void perf_check_microcode(void)
 {
 	if (x86_pmu.check_microcode)
@@ -1955,7 +1949,6 @@ static struct pmu pmu = {
 	.commit_txn		= x86_pmu_commit_txn,
 
 	.event_idx		= x86_pmu_event_idx,
-	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 	.sched_task		= x86_pmu_sched_task,
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 211b54c0c00c..949d0083a29e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -472,7 +472,6 @@ struct x86_pmu {
 	void		(*cpu_dead)(int cpu);
 
 	void		(*check_microcode)(void);
-	void		(*flush_branch_stack)(void);
 	void		(*sched_task)(struct perf_event_context *ctx,
 				      bool sched_in);
 
@@ -733,6 +732,8 @@ void intel_pmu_pebs_disable_all(void);
 
 void intel_ds_init(void);
 
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
 void intel_pmu_lbr_reset(void);
 
 void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..424fbf74dee7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2044,18 +2044,6 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
-static void intel_pmu_flush_branch_stack(void)
-{
-	/*
-	 * Intel LBR does not tag entries with the
-	 * PID of the current task, then we need to
-	 * flush it on ctxsw
-	 * For now, we simply reset it
-	 */
-	if (x86_pmu.lbr_nr)
-		intel_pmu_lbr_reset();
-}
-
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
 
 PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2107,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
-	.flush_branch_stack	= intel_pmu_flush_branch_stack,
+	.sched_task		= intel_pmu_lbr_sched_task,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 8bc078f43a82..c0e23c5f85bb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -177,6 +177,31 @@ void intel_pmu_lbr_reset(void)
 		intel_pmu_lbr_reset_64();
 }
 
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	/*
+	 * When sampling the branck stack in system-wide, it may be
+	 * necessary to flush the stack on context switch. This happens
+	 * when the branch stack does not tag its entries with the pid
+	 * of the current task. Otherwise it becomes impossible to
+	 * associate a branch entry with a task. This ambiguity is more
+	 * likely to appear when the branch stack supports priv level
+	 * filtering and the user sets it to monitor only at the user
+	 * level (which could be a useful measurement in system-wide
+	 * mode). In that case, the risk is high of having a branch
+	 * stack with branch from multiple tasks.
+ 	 */
+	if (sched_in) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = ctx;
+	}
+}
+
 void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -195,6 +220,7 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
 	cpuc->lbr_users++;
+	perf_sched_cb_inc(event->ctx->pmu);
 }
 
 void intel_pmu_lbr_disable(struct perf_event *event)
@@ -206,6 +232,7 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
+	perf_sched_cb_dec(event->ctx->pmu);
 
 	if (cpuc->enabled && !cpuc->lbr_users) {
 		__intel_pmu_lbr_disable();
-- 
cgit v1.2.3


From e9d7f7cd97c45e2c612d3b38be05b4cfb27939ee Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:00 -0500
Subject: perf/x86/intel: Add basic Haswell LBR call stack support

Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1,
bits (NEAR_REL_CALL, NEAR-IND_CALL, NEAR_RET) must be cleared. Due to
a hardware bug of Haswell, this feature doesn't work well with
FREEZE_LBRS_ON_PMI.

When the call stack feature is enabled, the LBR stack will capture
unfiltered call data normally, but as return instructions are executed,
the last captured branch record is flushed from the on-chip registers
in a last-in first-out (LIFO) manner. Thus, branch information relative
to leaf functions will not be captured, while preserving the call stack
information of the main line execution path.

This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           | 14 ++++-
 arch/x86/kernel/cpu/perf_event_intel.c     |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 91 ++++++++++++++++++++++--------
 3 files changed, 83 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 949d0083a29e..c9a62c5bca75 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -517,7 +517,11 @@ struct x86_pmu {
 };
 
 enum {
-	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
+
+	PERF_SAMPLE_BRANCH_CALL_STACK =
+				1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
 };
 
 #define x86_add_quirk(func_)						\
@@ -551,6 +555,12 @@ static struct perf_pmu_events_attr event_attr_##v = {			\
 
 extern struct x86_pmu x86_pmu __read_mostly;
 
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+	return  x86_pmu.lbr_sel_map &&
+		x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
 DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
 int x86_perf_event_set_period(struct perf_event *event);
@@ -754,6 +764,8 @@ void intel_pmu_lbr_init_atom(void);
 
 void intel_pmu_lbr_init_snb(void);
 
+void intel_pmu_lbr_init_hsw(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 int p4_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 424fbf74dee7..a485ba124476 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2537,7 +2537,7 @@ __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
-		intel_pmu_lbr_init_snb();
+		intel_pmu_lbr_init_hsw();
 
 		x86_pmu.event_constraints = intel_hsw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index c0e23c5f85bb..ac8279e560a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
 #define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
 #define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
 #define LBR_FAR_BIT		8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT	9 /* enable call stack */
 
 #define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
 #define LBR_USER	(1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
 #define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
 #define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
 #define LBR_FAR		(1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)
 
 #define LBR_PLM (LBR_KERNEL | LBR_USER)
 
@@ -74,24 +76,25 @@ static enum {
  * x86control flow changes include branches, interrupts, traps, faults
  */
 enum {
-	X86_BR_NONE     = 0,      /* unknown */
-
-	X86_BR_USER     = 1 << 0, /* branch target is user */
-	X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
-
-	X86_BR_CALL     = 1 << 2, /* call */
-	X86_BR_RET      = 1 << 3, /* return */
-	X86_BR_SYSCALL  = 1 << 4, /* syscall */
-	X86_BR_SYSRET   = 1 << 5, /* syscall return */
-	X86_BR_INT      = 1 << 6, /* sw interrupt */
-	X86_BR_IRET     = 1 << 7, /* return from interrupt */
-	X86_BR_JCC      = 1 << 8, /* conditional */
-	X86_BR_JMP      = 1 << 9, /* jump */
-	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
-	X86_BR_IND_CALL = 1 << 11,/* indirect calls */
-	X86_BR_ABORT    = 1 << 12,/* transaction abort */
-	X86_BR_IN_TX    = 1 << 13,/* in transaction */
-	X86_BR_NO_TX    = 1 << 14,/* not in transaction */
+	X86_BR_NONE		= 0,      /* unknown */
+
+	X86_BR_USER		= 1 << 0, /* branch target is user */
+	X86_BR_KERNEL		= 1 << 1, /* branch target is kernel */
+
+	X86_BR_CALL		= 1 << 2, /* call */
+	X86_BR_RET		= 1 << 3, /* return */
+	X86_BR_SYSCALL		= 1 << 4, /* syscall */
+	X86_BR_SYSRET		= 1 << 5, /* syscall return */
+	X86_BR_INT		= 1 << 6, /* sw interrupt */
+	X86_BR_IRET		= 1 << 7, /* return from interrupt */
+	X86_BR_JCC		= 1 << 8, /* conditional */
+	X86_BR_JMP		= 1 << 9, /* jump */
+	X86_BR_IRQ		= 1 << 10,/* hw interrupt or trap or fault */
+	X86_BR_IND_CALL		= 1 << 11,/* indirect calls */
+	X86_BR_ABORT		= 1 << 12,/* transaction abort */
+	X86_BR_IN_TX		= 1 << 13,/* in transaction */
+	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
+	X86_BR_CALL_STACK	= 1 << 15,/* call stack */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -373,7 +376,7 @@ void intel_pmu_lbr_read(void)
  * - in case there is no HW filter
  * - in case the HW filter has errata or limitations
  */
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 {
 	u64 br_type = event->attr.branch_sample_type;
 	int mask = 0;
@@ -410,11 +413,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 	if (br_type & PERF_SAMPLE_BRANCH_COND)
 		mask |= X86_BR_JCC;
 
+	if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+		if (!x86_pmu_has_lbr_callstack())
+			return -EOPNOTSUPP;
+		if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+			return -EINVAL;
+		mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+			X86_BR_CALL_STACK;
+	}
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
 	 */
 	event->hw.branch_reg.reg = mask;
+	return 0;
 }
 
 /*
@@ -443,8 +456,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
 
-	/* LBR_SELECT operates in suppress mode so invert mask */
-	reg->config = ~mask & x86_pmu.lbr_sel_mask;
+	/*
+	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+	 * in suppress mode. So LBR_SELECT should be set to
+	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+	 */
+	reg->config = mask ^ x86_pmu.lbr_sel_mask;
 
 	return 0;
 }
@@ -462,7 +479,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
 	/*
 	 * setup SW LBR filter
 	 */
-	intel_pmu_setup_sw_lbr_filter(event);
+	ret = intel_pmu_setup_sw_lbr_filter(event);
+	if (ret)
+		return ret;
 
 	/*
 	 * setup HW LBR filter, if any
@@ -732,6 +751,20 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
+	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_RETURN | LBR_CALL_STACK,
+};
+
 /* core */
 void __init intel_pmu_lbr_init_core(void)
 {
@@ -788,6 +821,20 @@ void __init intel_pmu_lbr_init_snb(void)
 	pr_cont("16-deep LBR, ");
 }
 
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+	x86_pmu.lbr_nr	 = 16;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	pr_cont("16-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
-- 
cgit v1.2.3


From e18bf526422769611e7248135e36a4cea0e4e38d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:03 -0500
Subject: perf/x86/intel: Allocate space for storing LBR stack

When the LBR call stack is enabled, it is necessary to save/restore
the LBR stack on context switch. We can use pmu specific data to
store LBR stack when task is scheduled out. This patch adds code
that allocates the pmu specific data.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-8-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 4 ++++
 arch/x86/kernel/cpu/perf_event.h | 7 +++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 6b1fd26a37cf..8ffd71ec2173 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -432,6 +432,9 @@ int x86_pmu_hw_config(struct perf_event *event)
 		}
 	}
 
+	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+		event->attach_state |= PERF_ATTACH_TASK_DATA;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -1950,6 +1953,7 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.sched_task		= x86_pmu_sched_task,
+	.task_ctx_size          = sizeof(struct x86_perf_task_context),
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c9a62c5bca75..69c26b396cf4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -516,6 +516,13 @@ struct x86_pmu {
 	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+struct x86_perf_task_context {
+	u64 lbr_from[MAX_LBR_ENTRIES];
+	u64 lbr_to[MAX_LBR_ENTRIES];
+	int lbr_callstack_users;
+	int lbr_stack_state;
+};
+
 enum {
 	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
 	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
-- 
cgit v1.2.3


From 63f0c1d84196b712fe9de99a8514486ab416d517 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:04 -0500
Subject: perf/x86/intel: Track number of events that use the LBR callstack

When enabling/disabling an event, check if the event uses the LBR
callstack feature, adjust the LBR callstack usage count accordingly.
Later patch will use the usage count to decide if LBR stack should
be saved/restored.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-9-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ac8279e560a1..ac8e54200934 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -205,9 +205,15 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 	}
 }
 
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+	return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
 void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -222,6 +228,12 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	}
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users++;
+	}
+
 	cpuc->lbr_users++;
 	perf_sched_cb_inc(event->ctx->pmu);
 }
@@ -229,10 +241,17 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 void intel_pmu_lbr_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users--;
+	}
+
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 	perf_sched_cb_dec(event->ctx->pmu);
-- 
cgit v1.2.3


From 76cb2c617f12a4dd53c0e899972813b805ad6cc2 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:05 -0500
Subject: perf/x86/intel: Save/restore LBR stack during context switch

When the LBR call stack is enabled, it is necessary to save/restore
the LBR stack on context switch. The solution is saving/restoring
the LBR stack to/from task's perf event context.

The LBR stack is saved/restored only when there are events that use
the LBR call stack. If no event uses LBR call stack, the LBR stack
is reset when task is scheduled in.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-10-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 88 ++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ac8e54200934..a8b3f236cf37 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -180,13 +180,89 @@ void intel_pmu_lbr_reset(void)
 		intel_pmu_lbr_reset_64();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+	return tos;
+}
+
+enum {
+	LBR_NONE,
+	LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0 ||
+	    task_ctx->lbr_stack_state == LBR_NONE) {
+		intel_pmu_lbr_reset();
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0) {
+		task_ctx->lbr_stack_state = LBR_NONE;
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_VALID;
+}
+
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	/*
+	 * If LBR callstack feature is enabled and the stack was saved when
+	 * the task was scheduled out, restore the stack. Otherwise flush
+	 * the LBR stack.
+	 */
+	task_ctx = ctx ? ctx->task_ctx_data : NULL;
+	if (task_ctx) {
+		if (sched_in) {
+			__intel_pmu_lbr_restore(task_ctx);
+			cpuc->lbr_context = ctx;
+		} else {
+			__intel_pmu_lbr_save(task_ctx);
+		}
+		return;
+	}
+
 	/*
 	 * When sampling the branck stack in system-wide, it may be
 	 * necessary to flush the stack on context switch. This happens
@@ -279,18 +355,6 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-	u64 tos;
-
-	rdmsrl(x86_pmu.lbr_tos, tos);
-
-	return tos;
-}
-
 static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
-- 
cgit v1.2.3


From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:06 -0500
Subject: perf: Simplify the branch stack check

Use event->attr.branch_sample_type to replace
intel_pmu_needs_lbr_smpl() for avoiding duplicated code that
implicitly enables the LBR.

Currently, branch stack can be enabled by user explicitly requesting
branch sampling or implicit branch sampling to correct PEBS skid.

For user explicitly requested branch sampling, the branch_sample_type
is explicitly set by user. For PEBS case, the branch_sample_type is also
implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a485ba124476..9f1dd18fa395 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-	/* user explicitly requested branch sampling */
-	if (has_branch_stack(event))
-		return true;
-
-	/* implicit branch sampling to correct PEBS skid */
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-	    x86_pmu.intel_cap.pebs_format < 2)
-		return true;
-
-	return false;
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	 * must disable before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_disable(event);
 
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	 * must enabled before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_enable(event);
 
 	if (event->attr.exclude_host)
@@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (event->attr.precise_ip && x86_pmu.pebs_aliases)
 		x86_pmu.pebs_aliases(event);
 
-	if (intel_pmu_needs_lbr_smpl(event)) {
+	if (needs_branch_stack(event)) {
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
-- 
cgit v1.2.3


From 4b854900995194601d767fcd112307b21ed278b2 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:08 -0500
Subject: perf/x86/intel: Re-organize code that implicitly enables LBR/PEBS

Make later patch more readable, no logic change.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-13-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8ffd71ec2173..e0dab5ce61e9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -399,36 +399,35 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
-		/*
-		 * check that PEBS LBR correction does not conflict with
-		 * whatever the user is asking with attr->branch_sample_type
-		 */
-		if (event->attr.precise_ip > 1 &&
-		    x86_pmu.intel_cap.pebs_format < 2) {
-			u64 *br_type = &event->attr.branch_sample_type;
-
-			if (has_branch_stack(event)) {
-				if (!precise_br_compat(event))
-					return -EOPNOTSUPP;
-
-				/* branch_sample_type is compatible */
-
-			} else {
-				/*
-				 * user did not specify  branch_sample_type
-				 *
-				 * For PEBS fixups, we capture all
-				 * the branches at the priv level of the
-				 * event.
-				 */
-				*br_type = PERF_SAMPLE_BRANCH_ANY;
-
-				if (!event->attr.exclude_user)
-					*br_type |= PERF_SAMPLE_BRANCH_USER;
-
-				if (!event->attr.exclude_kernel)
-					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-			}
+	}
+	/*
+	 * check that PEBS LBR correction does not conflict with
+	 * whatever the user is asking with attr->branch_sample_type
+	 */
+	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+		u64 *br_type = &event->attr.branch_sample_type;
+
+		if (has_branch_stack(event)) {
+			if (!precise_br_compat(event))
+				return -EOPNOTSUPP;
+
+			/* branch_sample_type is compatible */
+
+		} else {
+			/*
+			 * user did not specify  branch_sample_type
+			 *
+			 * For PEBS fixups, we capture all
+			 * the branches at the priv level of the
+			 * event.
+			 */
+			*br_type = PERF_SAMPLE_BRANCH_ANY;
+
+			if (!event->attr.exclude_user)
+				*br_type |= PERF_SAMPLE_BRANCH_USER;
+
+			if (!event->attr.exclude_kernel)
+				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
 		}
 	}
 
-- 
cgit v1.2.3


From 2c70d0086e4e9e2440f0f78098090f32bde14277 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:10 -0500
Subject: perf/x86/intel: Disable FREEZE_LBRS_ON_PMI when LBR operates in
 callstack mode

LBR callstack is designed for PEBS, It does not work well with
FREEZE_LBRS_ON_PMI for non PEBS event. If FREEZE_LBRS_ON_PMI is set for
non PEBS event, PMIs near call/return instructions may cause superfluous
increase/decrease of LBR_TOS.

This patch modifies __intel_pmu_lbr_enable() to not enable
FREEZE_LBRS_ON_PMI when LBR operates in callstack mode. We currently
don't use LBR callstack to capture kernel space callchain, so disabling
FREEZE_LBRS_ON_PMI should not be a problem.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-15-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index a8b3f236cf37..92a44fdbc9d3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -131,14 +131,23 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
 static void __intel_pmu_lbr_enable(void)
 {
-	u64 debugctl;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	u64 debugctl, lbr_select = 0;
 
-	if (cpuc->lbr_sel)
-		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+	if (cpuc->lbr_sel) {
+		lbr_select = cpuc->lbr_sel->config;
+		wrmsrl(MSR_LBR_SELECT, lbr_select);
+	}
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+	debugctl |= DEBUGCTLMSR_LBR;
+	/*
+	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+	 * may cause superfluous increase/decrease of LBR_TOS.
+	 */
+	if (!(lbr_select & LBR_CALL_STACK))
+		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-- 
cgit v1.2.3


From aa54ae9b87b83af7edabcc34a299e7e014609af4 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:11 -0500
Subject: perf/x86/intel: Discard zero length call entries in LBR call stack

"Zero length call" uses the attribute of the call instruction to push
the immediate instruction pointer on to the stack and then pops off
that address into a register. This is accomplished without any matching
return instruction. It confuses the hardware and make the recorded call
stack incorrect.

We can partially resolve this issue by: decode call instructions and
discard any zero length call entry in the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-16-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 92a44fdbc9d3..084f2eb20c8b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -94,7 +94,8 @@ enum {
 	X86_BR_ABORT		= 1 << 12,/* transaction abort */
 	X86_BR_IN_TX		= 1 << 13,/* in transaction */
 	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
-	X86_BR_CALL_STACK	= 1 << 15,/* call stack */
+	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
+	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -111,13 +112,15 @@ enum {
 	 X86_BR_JMP	 |\
 	 X86_BR_IRQ	 |\
 	 X86_BR_ABORT	 |\
-	 X86_BR_IND_CALL)
+	 X86_BR_IND_CALL |\
+	 X86_BR_ZERO_CALL)
 
 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
 
 #define X86_BR_ANY_CALL		 \
 	(X86_BR_CALL		|\
 	 X86_BR_IND_CALL	|\
+	 X86_BR_ZERO_CALL	|\
 	 X86_BR_SYSCALL		|\
 	 X86_BR_IRQ		|\
 	 X86_BR_INT)
@@ -702,6 +705,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 		ret = X86_BR_INT;
 		break;
 	case 0xe8: /* call near rel */
+		insn_get_immediate(&insn);
+		if (insn.immediate1.value == 0) {
+			/* zero length call */
+			ret = X86_BR_ZERO_CALL;
+			break;
+		}
 	case 0x9a: /* call far absolute */
 		ret = X86_BR_CALL;
 		break;
-- 
cgit v1.2.3


From 2c44b1936bb3b135a3fac8b3493394d42e51cf70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 5 Nov 2014 10:36:45 +0100
Subject: perf/x86/intel: Expose LBR callstack to user space tooling

With LBR call stack feature enable, there are three callchain options.
Enable the 3rd callchain option (LBR callstack) to user space tooling.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/20141105093759.GQ10501@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           | 8 --------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 8 ++++----
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 69c26b396cf4..a371d27d6795 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -523,14 +523,6 @@ struct x86_perf_task_context {
 	int lbr_stack_state;
 };
 
-enum {
-	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
-	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
-
-	PERF_SAMPLE_BRANCH_CALL_STACK =
-				1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
-};
-
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 084f2eb20c8b..0473874109cb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -537,7 +537,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	u64 mask = 0, v;
 	int i;
 
-	for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
+	for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
 		if (!(br_type & (1ULL << i)))
 			continue;
 
@@ -821,7 +821,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 /*
  * Map interface branch filters onto LBR filters
  */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
@@ -840,7 +840,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
 };
 
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
@@ -852,7 +852,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
-static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
-- 
cgit v1.2.3


From 4421f8f0fa02bc982b410cd773223cc280791c54 Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Wed, 18 Feb 2015 15:21:07 +0100
Subject: livepatch: remove extern specifier from header files

Storage-class specifier 'extern' is redundant in front of the function
declaration. According to the C specification it has the same meaning as
if not present at all. So remove it.

Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/livepatch.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
 #endif
 	return 0;
 }
-extern int klp_write_module_reloc(struct module *mod, unsigned long type,
-				  unsigned long loc, unsigned long value);
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+			   unsigned long loc, unsigned long value);
 
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
-- 
cgit v1.2.3


From cbc82b17263877ea5d21e84c58ce03f0292458a1 Mon Sep 17 00:00:00 2001
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Date: Fri, 23 Jan 2015 18:45:43 +0000
Subject: x86: Add support for Intel Cache QoS Monitoring (CQM) detection

This patch adds support for the new Cache QoS Monitoring (CQM)
feature found in future Intel Xeon processors.  It includes the
new values to track CQM resources to the cpuinfo_x86 structure,
plus the CPUID detection routines for CQM.

CQM allows a process, or set of processes, to be tracked by the CPU
to determine the cache usage of that task group.  Using this data
from the CPU, software can be written to extract this data and
report cache usage and occupancy for a particular process, or
group of processes.

More information about Cache QoS Monitoring can be found in the
Intel (R) x86 Architecture Software Developer Manual, section 17.14.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Chris Webb <chris@arachsys.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jacob Shin <jacob.w.shin@gmail.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Honeyman <stevenhoneyman@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-5-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h |  9 ++++++++-
 arch/x86/include/asm/processor.h  |  3 +++
 arch/x86/kernel/cpu/common.c      | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..361922dcc9b1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
 #include <asm/disabled-features.h>
 #endif
 
-#define NCAPINTS	11	/* N 32-bit words worth of info */
+#define NCAPINTS	13	/* N 32-bit words worth of info */
 #define NBUGINTS	1	/* N 32-bit bug flags */
 
 /*
@@ -226,6 +226,7 @@
 #define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
@@ -242,6 +243,12 @@
 #define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
 #define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */
 
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..a12d50e04d7a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
 	/* in KB - valid for CPUS which support this call: */
 	int			x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
 	/* cpuid returned max cores value: */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 07f2fc3c13a4..9fa00b2ea0ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -645,6 +645,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[10] = eax;
 	}
 
+	/* Additional Intel-defined flags: level 0x0000000F */
+	if (c->cpuid_level >= 0x0000000F) {
+		u32 eax, ebx, ecx, edx;
+
+		/* QoS sub-leaf, EAX=0Fh, ECX=0 */
+		cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[11] = edx;
+		if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+			/* will be overridden if occupancy monitoring exists */
+			c->x86_cache_max_rmid = ebx;
+
+			/* QoS sub-leaf, EAX=0Fh, ECX=1 */
+			cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+			c->x86_capability[12] = edx;
+			if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+				c->x86_cache_max_rmid = ecx;
+				c->x86_cache_occ_scale = ebx;
+			}
+		} else {
+			c->x86_cache_max_rmid = -1;
+			c->x86_cache_occ_scale = -1;
+		}
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -833,6 +857,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	detect_nopl(c);
 }
 
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+	/*
+	 * The heavy lifting of max_rmid and cache_occ_scale are handled
+	 * in get_cpu_cap().  Here we just set the max_rmid for the boot_cpu
+	 * in case CQM bits really aren't there in this CPU.
+	 */
+	if (c != &boot_cpu_data) {
+		boot_cpu_data.x86_cache_max_rmid =
+			min(boot_cpu_data.x86_cache_max_rmid,
+			    c->x86_cache_max_rmid);
+	}
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -922,6 +960,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
 	init_hypervisor(c);
 	x86_init_rdrand(c);
+	x86_init_cache_qos(c);
 
 	/*
 	 * Clear/Set all flags overriden by options, need do it
-- 
cgit v1.2.3


From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:44 +0000
Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support

Future Intel Xeon processors support a Cache QoS Monitoring feature that
allows tracking of the LLC occupancy for a task or task group, i.e. the
amount of data in pulled into the LLC for the task (group).

Currently the PMU only supports per-cpu events. We create an event for
each cpu and read out all the LLC occupancy values.

Because this results in duplicate values being written out to userspace,
we also export a .per-pkg event file so that the perf tools only
accumulate values for one cpu per package.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 530 +++++++++++++++++++++++++++++
 1 file changed, 530 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..05b4cd26f426
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,530 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL	0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+	raw_spinlock_t		lock;
+	int			rmid;
+	int			cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups.
+ */
+static DEFINE_MUTEX(cache_mutex);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR		(1ULL << 63)
+#define RMID_VAL_UNAVAIL	(1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)
+
+#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
+
+static u64 __rmid_read(unsigned long rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+static unsigned long *cqm_rmid_bitmap;
+
+/*
+ * Returns < 0 on fail.
+ */
+static int __get_rmid(void)
+{
+	return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0);
+}
+
+static void __put_rmid(int rmid)
+{
+	bitmap_release_region(cqm_rmid_bitmap, rmid, 0);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+	cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL);
+	if (!cqm_rmid_bitmap)
+		return -ENOMEM;
+
+	bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid);
+
+	/*
+	 * RMID 0 is special and is always allocated. It's used for all
+	 * tasks that are not monitored.
+	 */
+	bitmap_allocate_region(cqm_rmid_bitmap, 0, 0);
+
+	return 0;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+	/* not task */
+
+	return true; /* if not task, we're machine wide */
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static int intel_cqm_setup_event(struct perf_event *event,
+				 struct perf_event **group)
+{
+	struct perf_event *iter;
+	int rmid;
+
+	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+		if (__match_event(iter, event)) {
+			/* All tasks in a group share an RMID */
+			event->hw.cqm_rmid = iter->hw.cqm_rmid;
+			*group = iter;
+			return 0;
+		}
+
+		if (__conflict_event(iter, event))
+			return -EBUSY;
+	}
+
+	rmid = __get_rmid();
+	if (rmid < 0)
+		return rmid;
+
+	event->hw.cqm_rmid = rmid;
+	return 0;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+	unsigned long rmid = event->hw.cqm_rmid;
+	u64 val;
+
+	val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	local64_set(&event->count, val);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long flags;
+
+	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (state->cnt++)
+		WARN_ON_ONCE(state->rmid != rmid);
+	else
+		WARN_ON_ONCE(state->rmid);
+
+	state->rmid = rmid;
+	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned long flags;
+
+	if (event->hw.cqm_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cqm_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	intel_cqm_event_read(event);
+
+	if (!--state->cnt) {
+		state->rmid = 0;
+		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+	} else {
+		WARN_ON_ONCE(!state->rmid);
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	int rmid;
+
+	event->hw.cqm_state = PERF_HES_STOPPED;
+	rmid = event->hw.cqm_rmid;
+	WARN_ON_ONCE(!rmid);
+
+	if (mode & PERF_EF_START)
+		intel_cqm_event_start(event, mode);
+
+	return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+	intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	mutex_lock(&cache_mutex);
+
+	/*
+	 * If there's another event in this group...
+	 */
+	if (!list_empty(&event->hw.cqm_group_entry)) {
+		group_other = list_first_entry(&event->hw.cqm_group_entry,
+					       struct perf_event,
+					       hw.cqm_group_entry);
+		list_del(&event->hw.cqm_group_entry);
+	}
+
+	/*
+	 * And we're the group leader..
+	 */
+	if (!list_empty(&event->hw.cqm_groups_entry)) {
+		/*
+		 * If there was a group_other, make that leader, otherwise
+		 * destroy the group and return the RMID.
+		 */
+		if (group_other) {
+			list_replace(&event->hw.cqm_groups_entry,
+				     &group_other->hw.cqm_groups_entry);
+		} else {
+			int rmid = event->hw.cqm_rmid;
+
+			__put_rmid(rmid);
+			list_del(&event->hw.cqm_groups_entry);
+		}
+	}
+
+	mutex_unlock(&cache_mutex);
+}
+
+static struct pmu intel_cqm_pmu;
+
+/*
+ * XXX there's a bit of a problem in that we cannot simply do the one
+ * event per node as one would want, since that one event would one get
+ * scheduled on the one cpu. But we want to 'schedule' the RMID on all
+ * CPUs.
+ *
+ * This means we want events for each CPU, however, that generates a lot
+ * of duplicate values out to userspace -- this is not to be helped
+ * unless we want to change the core code in some way. Fore more info,
+ * see intel_cqm_event_read().
+ */
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event *group = NULL;
+	int err;
+
+	if (event->attr.type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	if (event->cpu == -1)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cache_mutex);
+
+	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	if (err)
+		goto out;
+
+	if (group) {
+		list_add_tail(&event->hw.cqm_group_entry,
+			      &group->hw.cqm_group_entry);
+	} else {
+		list_add_tail(&event->hw.cqm_groups_entry,
+			      &cache_groups);
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return err;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+	EVENT_PTR(intel_cqm_llc),
+	EVENT_PTR(intel_cqm_llc_pkg),
+	EVENT_PTR(intel_cqm_llc_unit),
+	EVENT_PTR(intel_cqm_llc_scale),
+	EVENT_PTR(intel_cqm_llc_snapshot),
+	NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+	.name = "events",
+	.attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+	.name = "format",
+	.attrs = intel_cqm_formats_attr,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+	&intel_cqm_events_group,
+	&intel_cqm_format_group,
+	NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+	.attr_groups	= intel_cqm_attr_groups,
+	.task_ctx_nr	= perf_sw_context,
+	.event_init	= intel_cqm_event_init,
+	.add		= intel_cqm_event_add,
+	.del		= intel_cqm_event_del,
+	.start		= intel_cqm_event_start,
+	.stop		= intel_cqm_event_stop,
+	.read		= intel_cqm_event_read,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	for_each_cpu(i, &cqm_cpumask) {
+		if (phys_id == topology_physical_package_id(i))
+			return;	/* already got reader for this socket */
+	}
+
+	cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	raw_spin_lock_init(&state->lock);
+	state->rmid = 0;
+	state->cnt  = 0;
+
+	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	/*
+	 * Is @cpu a designated cqm reader?
+	 */
+	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+		return;
+
+	for_each_online_cpu(i) {
+		if (i == cpu)
+			continue;
+
+		if (phys_id == topology_physical_package_id(i)) {
+			cpumask_set_cpu(i, &cqm_cpumask);
+			break;
+		}
+	}
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu  = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		intel_cqm_cpu_prepare(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+	{}
+};
+
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(cpu) {
+		struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+		if (c->x86_cache_max_rmid < cqm_max_rmid)
+			cqm_max_rmid = c->x86_cache_max_rmid;
+
+		if (c->x86_cache_occ_scale != cqm_l3_scale) {
+			pr_err("Multiple LLC scale values, disabling\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	ret = intel_cqm_setup_rmid_cache();
+	if (ret)
+		goto out;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_prepare(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+
+	if (ret)
+		pr_err("Intel CQM perf registration failed: %d\n", ret);
+	else
+		pr_info("Intel CQM monitoring enabled\n");
+
+out:
+	cpu_notifier_register_done();
+
+	return ret;
+}
+device_initcall(intel_cqm_init);
-- 
cgit v1.2.3


From 35298e554c74b7849875e3676ba8eaf833c7b917 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:45 +0000
Subject: perf/x86/intel: Implement LRU monitoring ID allocation for CQM

It's possible to run into issues with re-using unused monitoring IDs
because there may be stale cachelines associated with that ID from a
previous allocation. This can cause the LLC occupancy values to be
inaccurate.

To attempt to mitigate this problem we place the IDs on a least recently
used list, essentially a FIFO. The basic idea is that the longer the
time period between ID re-use the lower the probability that stale
cachelines exist in the cache.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-7-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 100 ++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 05b4cd26f426..b5d9d746dbc0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,7 +25,7 @@ struct intel_cqm_state {
 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
 
 /*
- * Protects cache_cgroups.
+ * Protects cache_cgroups and cqm_rmid_lru.
  */
 static DEFINE_MUTEX(cache_mutex);
 
@@ -64,36 +64,120 @@ static u64 __rmid_read(unsigned long rmid)
 	return val;
 }
 
-static unsigned long *cqm_rmid_bitmap;
+struct cqm_rmid_entry {
+	u64 rmid;
+	struct list_head list;
+};
+
+/*
+ * A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ * This list is protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+	struct cqm_rmid_entry *entry;
+
+	entry = cqm_rmid_ptrs[rmid];
+	WARN_ON(entry->rmid != rmid);
+
+	return entry;
+}
 
 /*
  * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
  */
 static int __get_rmid(void)
 {
-	return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0);
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	if (list_empty(&cqm_rmid_lru))
+		return -EAGAIN;
+
+	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
+	list_del(&entry->list);
+
+	return entry->rmid;
 }
 
 static void __put_rmid(int rmid)
 {
-	bitmap_release_region(cqm_rmid_bitmap, rmid, 0);
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	entry = __rmid_entry(rmid);
+
+	list_add_tail(&entry->list, &cqm_rmid_lru);
 }
 
 static int intel_cqm_setup_rmid_cache(void)
 {
-	cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL);
-	if (!cqm_rmid_bitmap)
+	struct cqm_rmid_entry *entry;
+	int r;
+
+	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+				(cqm_max_rmid + 1), GFP_KERNEL);
+	if (!cqm_rmid_ptrs)
 		return -ENOMEM;
 
-	bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid);
+	for (r = 0; r <= cqm_max_rmid; r++) {
+		struct cqm_rmid_entry *entry;
+
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry)
+			goto fail;
+
+		INIT_LIST_HEAD(&entry->list);
+		entry->rmid = r;
+		cqm_rmid_ptrs[r] = entry;
+
+		list_add_tail(&entry->list, &cqm_rmid_lru);
+	}
 
 	/*
 	 * RMID 0 is special and is always allocated. It's used for all
 	 * tasks that are not monitored.
 	 */
-	bitmap_allocate_region(cqm_rmid_bitmap, 0, 0);
+	entry = __rmid_entry(0);
+	list_del(&entry->list);
 
 	return 0;
+fail:
+	while (r--)
+		kfree(cqm_rmid_ptrs[r]);
+
+	kfree(cqm_rmid_ptrs);
+	return -ENOMEM;
 }
 
 /*
-- 
cgit v1.2.3


From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:46 +0000
Subject: perf/x86/intel: Support task events with Intel CQM

Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather LLC occupancy values in
intel_cqm_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userspace which knows how to discard all but one cpu result per package.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we defer updating the LLC occupancy
value in event->count from intel_cqm_event_read() and do an SMP
cross-call to read values for all packages in intel_cqm_event_count().
We need to ensure that we only do this for one task event per cache
group, otherwise we'll report duplicate values.

If we're a system-wide event we want to fallback to the default
perf_event_count() implementation. Refactor this into a common function
so that we don't duplicate the code.

Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an
event's task (if the event isn't per-cpu) inside of the Intel CQM PMU
driver.  This task information is only availble in the upper layers of
the perf infrastructure.

Other perf backends stash the target task in event->hw.*target so we
need to do something similar. The task is used to determine whether
events should share a cache group and an RMID.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++----
 1 file changed, 174 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index b5d9d746dbc0..8003d87afd89 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -182,23 +182,124 @@ fail:
 
 /*
  * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
  */
 static bool __match_event(struct perf_event *a, struct perf_event *b)
 {
+	/* Per-cpu and task events don't mix */
 	if ((a->attach_state & PERF_ATTACH_TASK) !=
 	    (b->attach_state & PERF_ATTACH_TASK))
 		return false;
 
-	/* not task */
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not task event, we're machine wide */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target same task are placed into the same cache group.
+	 */
+	if (a->hw.cqm_target == b->hw.cqm_target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return perf_cgroup_from_task(event->hw.cqm_target);
 
-	return true; /* if not task, we're machine wide */
+	return event->cgrp;
 }
+#endif
 
 /*
  * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *		   PROHIBITS
+ *     system-wide    -> 	cgroup and task
+ *     cgroup 	      ->	system-wide
+ *     		      ->	task in cgroup
+ *     task 	      -> 	system-wide
+ *     		      ->	task in cgroup
+ *
+ * Call this function before allocating an RMID.
  */
 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 {
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * We can have any number of cgroups but only one system-wide
+	 * event at a time.
+	 */
+	if (a->cgrp && b->cgrp) {
+		struct perf_cgroup *ac = a->cgrp;
+		struct perf_cgroup *bc = b->cgrp;
+
+		/*
+		 * This condition should have been caught in
+		 * __match_event() and we should be sharing an RMID.
+		 */
+		WARN_ON_ONCE(ac == bc);
+
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+
+	if (a->cgrp || b->cgrp) {
+		struct perf_cgroup *ac, *bc;
+
+		/*
+		 * cgroup and system-wide events are mutually exclusive
+		 */
+		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+			return true;
+
+		/*
+		 * Ensure neither event is part of the other's cgroup
+		 */
+		ac = event_to_cgroup(a);
+		bc = event_to_cgroup(b);
+		if (ac == bc)
+			return true;
+
+		/*
+		 * Must have cgroup and non-intersecting task events.
+		 */
+		if (!ac || !bc)
+			return false;
+
+		/*
+		 * We have cgroup and task events, and the task belongs
+		 * to a cgroup. Check for for overlap.
+		 */
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+#endif
 	/*
 	 * If one of them is not a task, same story as above with cgroups.
 	 */
@@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event,
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long rmid;
 	u64 val;
 
+	/*
+	 * Task events are handled by intel_cqm_event_count().
+	 */
+	if (event->cpu == -1)
+		return;
+
+	rmid = event->hw.cqm_rmid;
 	val = __rmid_read(rmid);
 
 	/*
@@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event)
 	local64_set(&event->count, val);
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = __rmid_read(rr->rmid);
+
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+	struct rmid_read rr = {
+		.rmid = event->hw.cqm_rmid,
+		.value = ATOMIC64_INIT(0),
+	};
+
+	/*
+	 * We only need to worry about task events. System-wide events
+	 * are handled like usual, i.e. entirely with
+	 * intel_cqm_event_read().
+	 */
+	if (event->cpu != -1)
+		return __perf_event_count(event);
+
+	/*
+	 * Only the group leader gets to report values. This stops us
+	 * reporting duplicate values to userspace, and gives us a clear
+	 * rule for which task gets to report the values.
+	 *
+	 * Note that it is impossible to attribute these values to
+	 * specific packages - we forfeit that ability when we create
+	 * task events.
+	 */
+	if (!cqm_group_leader(event))
+		return 0;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+	local64_set(&event->count, atomic64_read(&rr.value));
+
+	return __perf_event_count(event);
+}
+
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
@@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	/*
 	 * And we're the group leader..
 	 */
-	if (!list_empty(&event->hw.cqm_groups_entry)) {
+	if (cqm_group_leader(event)) {
 		/*
 		 * If there was a group_other, make that leader, otherwise
 		 * destroy the group and return the RMID.
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 
 static struct pmu intel_cqm_pmu;
 
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
@@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.config & ~QOS_EVENT_MASK)
 		return -EINVAL;
 
-	if (event->cpu == -1)
-		return -EINVAL;
-
 	/* unsupported modes and filters */
 	if (event->attr.exclude_user   ||
 	    event->attr.exclude_kernel ||
@@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 
 	mutex_lock(&cache_mutex);
 
-	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	/* Will also set rmid */
+	err = intel_cqm_setup_event(event, &group);
 	if (err)
 		goto out;
 
@@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = {
 	.start		= intel_cqm_event_start,
 	.stop		= intel_cqm_event_stop,
 	.read		= intel_cqm_event_read,
+	.count		= intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -599,8 +752,8 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+				PERF_TYPE_INTEL_CQM);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
-- 
cgit v1.2.3


From bff671dba7981195a644a5dc210d65de8ae2d251 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:47 +0000
Subject: perf/x86/intel: Perform rotation on Intel CQM RMIDs

There are many use cases where people will want to monitor more tasks
than there exist RMIDs in the hardware, meaning that we have to perform
some kind of multiplexing.

We do this by "rotating" the RMIDs in a workqueue, and assigning an RMID
to a waiting event when the RMID becomes unused.

This scheme reserves one RMID at all times for rotation. When we need to
schedule a new event we give it the reserved RMID, pick a victim event
from the front of the global CQM list and wait for the victim's RMID to
drop to zero occupancy, before it becomes the new reserved RMID.

We put the victim's RMID onto the limbo list, where it resides for a
"minimum queue time", which is intended to save ourselves an expensive
smp IPI when the RMID is unlikely to have a occupancy value below
__intel_cqm_threshold.

If we fail to recycle an RMID, even after waiting the minimum queue time
then we need to increment __intel_cqm_threshold. There is an upper bound
on this threshold, __intel_cqm_max_threshold, which is programmable from
userland as /sys/devices/intel_cqm/max_recycling_threshold.

The comments above __intel_cqm_rmid_rotate() have more details.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-9-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 671 ++++++++++++++++++++++++++---
 1 file changed, 623 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 8003d87afd89..e31f5086f2b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,9 +25,13 @@ struct intel_cqm_state {
 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
 
 /*
- * Protects cache_cgroups and cqm_rmid_lru.
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
  */
 static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
 
 /*
  * Groups of events that have the same target(s), one RMID per group.
@@ -46,7 +50,34 @@ static cpumask_t cqm_cpumask;
 
 #define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
 
-static u64 __rmid_read(unsigned long rmid)
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID		(-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+	if (!rmid || rmid == INVALID_RMID)
+		return false;
+
+	return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
 {
 	u64 val;
 
@@ -64,13 +95,21 @@ static u64 __rmid_read(unsigned long rmid)
 	return val;
 }
 
+enum rmid_recycle_state {
+	RMID_YOUNG = 0,
+	RMID_AVAILABLE,
+	RMID_DIRTY,
+};
+
 struct cqm_rmid_entry {
-	u64 rmid;
+	unsigned int rmid;
+	enum rmid_recycle_state state;
 	struct list_head list;
+	unsigned long queue_time;
 };
 
 /*
- * A least recently used list of RMIDs.
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
  *
  * Oldest entry at the head, newest (most recently used) entry at the
  * tail. This list is never traversed, it's only used to keep track of
@@ -81,9 +120,18 @@ struct cqm_rmid_entry {
  * in use. To mark an RMID as in use, remove its entry from the lru
  * list.
  *
- * This list is protected by cache_mutex.
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
  */
-static LIST_HEAD(cqm_rmid_lru);
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
 
 /*
  * We use a simple array of pointers so that we can lookup a struct
@@ -120,37 +168,43 @@ static int __get_rmid(void)
 
 	lockdep_assert_held(&cache_mutex);
 
-	if (list_empty(&cqm_rmid_lru))
-		return -EAGAIN;
+	if (list_empty(&cqm_rmid_free_lru))
+		return INVALID_RMID;
 
-	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
+	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
 	list_del(&entry->list);
 
 	return entry->rmid;
 }
 
-static void __put_rmid(int rmid)
+static void __put_rmid(unsigned int rmid)
 {
 	struct cqm_rmid_entry *entry;
 
 	lockdep_assert_held(&cache_mutex);
 
+	WARN_ON(!__rmid_valid(rmid));
 	entry = __rmid_entry(rmid);
 
-	list_add_tail(&entry->list, &cqm_rmid_lru);
+	entry->queue_time = jiffies;
+	entry->state = RMID_YOUNG;
+
+	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
 }
 
 static int intel_cqm_setup_rmid_cache(void)
 {
 	struct cqm_rmid_entry *entry;
-	int r;
+	unsigned int nr_rmids;
+	int r = 0;
 
+	nr_rmids = cqm_max_rmid + 1;
 	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-				(cqm_max_rmid + 1), GFP_KERNEL);
+				nr_rmids, GFP_KERNEL);
 	if (!cqm_rmid_ptrs)
 		return -ENOMEM;
 
-	for (r = 0; r <= cqm_max_rmid; r++) {
+	for (; r <= cqm_max_rmid; r++) {
 		struct cqm_rmid_entry *entry;
 
 		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -161,7 +215,7 @@ static int intel_cqm_setup_rmid_cache(void)
 		entry->rmid = r;
 		cqm_rmid_ptrs[r] = entry;
 
-		list_add_tail(&entry->list, &cqm_rmid_lru);
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
 	}
 
 	/*
@@ -171,6 +225,10 @@ static int intel_cqm_setup_rmid_cache(void)
 	entry = __rmid_entry(0);
 	list_del(&entry->list);
 
+	mutex_lock(&cache_mutex);
+	intel_cqm_rotation_rmid = __get_rmid();
+	mutex_unlock(&cache_mutex);
+
 	return 0;
 fail:
 	while (r--)
@@ -313,6 +371,424 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 	return false;
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+	struct perf_event *event;
+	unsigned int old_rmid = group->hw.cqm_rmid;
+	struct list_head *head = &group->hw.cqm_group_entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	/*
+	 * If our RMID is being deallocated, perform a read now.
+	 */
+	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+		struct rmid_read rr = {
+			.value = ATOMIC64_INIT(0),
+			.rmid = old_rmid,
+		};
+
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+				 &rr, 1);
+		local64_set(&group->count, atomic64_read(&rr.value));
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+
+	group->hw.cqm_rmid = rmid;
+	list_for_each_entry(event, head, hw.cqm_group_entry)
+		event->hw.cqm_rmid = rmid;
+
+	raw_spin_unlock_irq(&cache_lock);
+
+	return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+	struct cqm_rmid_entry *entry;
+
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		if (entry->state != RMID_AVAILABLE)
+			break;
+
+		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+			entry->state = RMID_DIRTY;
+	}
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+	struct perf_event *leader, *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	leader = list_first_entry(&cache_groups, struct perf_event,
+				  hw.cqm_groups_entry);
+	event = leader;
+
+	list_for_each_entry_continue(event, &cache_groups,
+				     hw.cqm_groups_entry) {
+		if (__rmid_valid(event->hw.cqm_rmid))
+			continue;
+
+		if (__conflict_event(event, leader))
+			continue;
+
+		intel_cqm_xchg_rmid(event, rmid);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+	struct cqm_rmid_entry *entry, *tmp;
+	struct perf_event *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	*available = 0;
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		unsigned long min_queue_time;
+		unsigned long now = jiffies;
+
+		/*
+		 * We hold RMIDs placed into limbo for a minimum queue
+		 * time. Before the minimum queue time has elapsed we do
+		 * not recycle RMIDs.
+		 *
+		 * The reasoning is that until a sufficient time has
+		 * passed since we stopped using an RMID, any RMID
+		 * placed onto the limbo list will likely still have
+		 * data tagged in the cache, which means we'll probably
+		 * fail to recycle it anyway.
+		 *
+		 * We can save ourselves an expensive IPI by skipping
+		 * any RMIDs that have not been queued for the minimum
+		 * time.
+		 */
+		min_queue_time = entry->queue_time +
+			msecs_to_jiffies(__rmid_queue_time_ms);
+
+		if (time_after(min_queue_time, now))
+			break;
+
+		entry->state = RMID_AVAILABLE;
+		(*available)++;
+	}
+
+	/*
+	 * Fast return if none of the RMIDs on the limbo list have been
+	 * sitting on the queue for the minimum queue time.
+	 */
+	if (!*available)
+		return false;
+
+	/*
+	 * Test whether an RMID is free for each package.
+	 */
+	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+		/*
+		 * Exhausted all RMIDs that have waited min queue time.
+		 */
+		if (entry->state == RMID_YOUNG)
+			break;
+
+		if (entry->state == RMID_DIRTY)
+			continue;
+
+		list_del(&entry->list);	/* remove from limbo */
+
+		/*
+		 * The rotation RMID gets priority if it's
+		 * currently invalid. In which case, skip adding
+		 * the RMID to the the free lru.
+		 */
+		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+			intel_cqm_rotation_rmid = entry->rmid;
+			continue;
+		}
+
+		/*
+		 * If we have groups waiting for RMIDs, hand
+		 * them one now.
+		 */
+		list_for_each_entry(event, &cache_groups,
+				    hw.cqm_groups_entry) {
+			if (__rmid_valid(event->hw.cqm_rmid))
+				continue;
+
+			intel_cqm_xchg_rmid(event, entry->rmid);
+			entry = NULL;
+			break;
+		}
+
+		if (!entry)
+			continue;
+
+		/*
+		 * Otherwise place it onto the free list.
+		 */
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
+	}
+
+
+	return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ */
+static struct perf_event *
+__intel_cqm_pick_and_rotate(void)
+{
+	struct perf_event *rotor;
+
+	lockdep_assert_held(&cache_mutex);
+	lockdep_assert_held(&cache_lock);
+
+	rotor = list_first_entry(&cache_groups, struct perf_event,
+				 hw.cqm_groups_entry);
+	list_rotate_left(&cache_groups);
+
+	return rotor;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+	struct perf_event *group, *rotor, *start = NULL;
+	unsigned int threshold_limit;
+	unsigned int nr_needed = 0;
+	unsigned int nr_available;
+	unsigned int rmid;
+	bool rotated = false;
+
+	mutex_lock(&cache_mutex);
+
+again:
+	/*
+	 * Fast path through this function if there are no groups and no
+	 * RMIDs that need cleaning.
+	 */
+	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+		if (!__rmid_valid(group->hw.cqm_rmid)) {
+			if (!start)
+				start = group;
+			nr_needed++;
+		}
+	}
+
+	/*
+	 * We have some event groups, but they all have RMIDs assigned
+	 * and no RMIDs need cleaning.
+	 */
+	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	if (!nr_needed)
+		goto stabilize;
+
+	/*
+	 * We have more event groups without RMIDs than available RMIDs.
+	 *
+	 * We force deallocate the rmid of the group at the head of
+	 * cache_groups. The first event group without an RMID then gets
+	 * assigned intel_cqm_rotation_rmid. This ensures we always make
+	 * forward progress.
+	 *
+	 * Rotate the cache_groups list so the previous head is now the
+	 * tail.
+	 */
+	rotor = __intel_cqm_pick_and_rotate();
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned.
+	 */
+	if (!__rmid_valid(rmid))
+		goto stabilize;
+
+	/*
+	 * If the rotation is going to succeed, reduce the threshold so
+	 * that we don't needlessly reuse dirty RMIDs.
+	 */
+	if (__rmid_valid(intel_cqm_rotation_rmid)) {
+		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+		intel_cqm_rotation_rmid = INVALID_RMID;
+
+		if (__intel_cqm_threshold)
+			__intel_cqm_threshold--;
+	}
+
+	__put_rmid(rmid);
+
+	rotated = true;
+
+stabilize:
+	/*
+	 * We now need to stablize the RMID we freed above (if any) to
+	 * ensure that the next time we rotate we have an RMID with zero
+	 * occupancy value.
+	 *
+	 * Alternatively, if we didn't need to perform any rotation,
+	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
+	 */
+	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+	while (intel_cqm_rmid_stabilize(&nr_available) &&
+	       __intel_cqm_threshold < threshold_limit) {
+		unsigned int steal_limit;
+
+		/*
+		 * Don't spin if nobody is actively waiting for an RMID,
+		 * the rotation worker will be kicked as soon as an
+		 * event needs an RMID anyway.
+		 */
+		if (!nr_needed)
+			break;
+
+		/* Allow max 25% of RMIDs to be in limbo. */
+		steal_limit = (cqm_max_rmid + 1) / 4;
+
+		/*
+		 * We failed to stabilize any RMIDs so our rotation
+		 * logic is now stuck. In order to make forward progress
+		 * we have a few options:
+		 *
+		 *   1. rotate ("steal") another RMID
+		 *   2. increase the threshold
+		 *   3. do nothing
+		 *
+		 * We do both of 1. and 2. until we hit the steal limit.
+		 *
+		 * The steal limit prevents all RMIDs ending up on the
+		 * limbo list. This can happen if every RMID has a
+		 * non-zero occupancy above threshold_limit, and the
+		 * occupancy values aren't dropping fast enough.
+		 *
+		 * Note that there is prioritisation at work here - we'd
+		 * rather increase the number of RMIDs on the limbo list
+		 * than increase the threshold, because increasing the
+		 * threshold skews the event data (because we reuse
+		 * dirty RMIDs) - threshold bumps are a last resort.
+		 */
+		if (nr_available < steal_limit)
+			goto again;
+
+		__intel_cqm_threshold++;
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+	unsigned long delay;
+
+	__intel_cqm_rmid_rotate();
+
+	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+	schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
 /*
  * Find a group and setup RMID.
  *
@@ -322,7 +798,6 @@ static int intel_cqm_setup_event(struct perf_event *event,
 				 struct perf_event **group)
 {
 	struct perf_event *iter;
-	int rmid;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 		if (__match_event(iter, event)) {
@@ -336,17 +811,14 @@ static int intel_cqm_setup_event(struct perf_event *event,
 			return -EBUSY;
 	}
 
-	rmid = __get_rmid();
-	if (rmid < 0)
-		return rmid;
-
-	event->hw.cqm_rmid = rmid;
+	event->hw.cqm_rmid = __get_rmid();
 	return 0;
 }
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid;
+	unsigned long flags;
+	unsigned int rmid;
 	u64 val;
 
 	/*
@@ -355,23 +827,25 @@ static void intel_cqm_event_read(struct perf_event *event)
 	if (event->cpu == -1)
 		return;
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
 	rmid = event->hw.cqm_rmid;
+
+	if (!__rmid_valid(rmid))
+		goto out;
+
 	val = __rmid_read(rmid);
 
 	/*
 	 * Ignore this reading on error states and do not update the value.
 	 */
 	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
+		goto out;
 
 	local64_set(&event->count, val);
+out:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
-struct rmid_read {
-	unsigned int rmid;
-	atomic64_t value;
-};
-
 static void __intel_cqm_event_count(void *info)
 {
 	struct rmid_read *rr = info;
@@ -392,8 +866,8 @@ static inline bool cqm_group_leader(struct perf_event *event)
 
 static u64 intel_cqm_event_count(struct perf_event *event)
 {
+	unsigned long flags;
 	struct rmid_read rr = {
-		.rmid = event->hw.cqm_rmid,
 		.value = ATOMIC64_INIT(0),
 	};
 
@@ -417,17 +891,36 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!cqm_group_leader(event))
 		return 0;
 
-	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	/*
+	 * Notice that we don't perform the reading of an RMID
+	 * atomically, because we can't hold a spin lock across the
+	 * IPIs.
+	 *
+	 * Speculatively perform the read, since @event might be
+	 * assigned a different (possibly invalid) RMID while we're
+	 * busying performing the IPI calls. It's therefore necessary to
+	 * check @event's RMID afterwards, and if it has changed,
+	 * discard the result of the read.
+	 */
+	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
 
-	local64_set(&event->count, atomic64_read(&rr.value));
+	if (!__rmid_valid(rr.rmid))
+		goto out;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	if (event->hw.cqm_rmid == rr.rmid)
+		local64_set(&event->count, atomic64_read(&rr.value));
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
 	return __perf_event_count(event);
 }
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned int rmid = event->hw.cqm_rmid;
 	unsigned long flags;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
@@ -473,15 +966,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
-	int rmid;
+	unsigned long flags;
+	unsigned int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
 
 	event->hw.cqm_state = PERF_HES_STOPPED;
 	rmid = event->hw.cqm_rmid;
-	WARN_ON_ONCE(!rmid);
 
-	if (mode & PERF_EF_START)
+	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
 		intel_cqm_event_start(event, mode);
 
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
 	return 0;
 }
 
@@ -518,9 +1015,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 			list_replace(&event->hw.cqm_groups_entry,
 				     &group_other->hw.cqm_groups_entry);
 		} else {
-			int rmid = event->hw.cqm_rmid;
+			unsigned int rmid = event->hw.cqm_rmid;
 
-			__put_rmid(rmid);
+			if (__rmid_valid(rmid))
+				__put_rmid(rmid);
 			list_del(&event->hw.cqm_groups_entry);
 		}
 	}
@@ -528,11 +1026,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	mutex_unlock(&cache_mutex);
 }
 
-static struct pmu intel_cqm_pmu;
-
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
+	bool rotate = false;
 	int err;
 
 	if (event->attr.type != intel_cqm_pmu.type)
@@ -569,10 +1066,24 @@ static int intel_cqm_event_init(struct perf_event *event)
 	} else {
 		list_add_tail(&event->hw.cqm_groups_entry,
 			      &cache_groups);
+
+		/*
+		 * All RMIDs are either in use or have recently been
+		 * used. Kick the rotation worker to clean/free some.
+		 *
+		 * We only do this for the group leader, rather than for
+		 * every event in a group to save on needless work.
+		 */
+		if (!__rmid_valid(event->hw.cqm_rmid))
+			rotate = true;
 	}
 
 out:
 	mutex_unlock(&cache_mutex);
+
+	if (rotate)
+		schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
 	return err;
 }
 
@@ -607,22 +1118,76 @@ static struct attribute_group intel_cqm_format_group = {
 	.attrs = intel_cqm_formats_attr,
 };
 
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+			   char *page)
+{
+	ssize_t rv;
+
+	mutex_lock(&cache_mutex);
+	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+	mutex_unlock(&cache_mutex);
+
+	return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned int bytes, cachelines;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &bytes);
+	if (ret)
+		return ret;
+
+	mutex_lock(&cache_mutex);
+
+	__intel_cqm_max_threshold = bytes;
+	cachelines = bytes / cqm_l3_scale;
+
+	/*
+	 * The new maximum takes effect immediately.
+	 */
+	if (__intel_cqm_threshold > cachelines)
+		__intel_cqm_threshold = cachelines;
+
+	mutex_unlock(&cache_mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+	&dev_attr_max_recycle_threshold.attr,
+	NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+	.attrs = intel_cqm_attrs,
+};
+
 static const struct attribute_group *intel_cqm_attr_groups[] = {
 	&intel_cqm_events_group,
 	&intel_cqm_format_group,
+	&intel_cqm_group,
 	NULL,
 };
 
 static struct pmu intel_cqm_pmu = {
-	.attr_groups	= intel_cqm_attr_groups,
-	.task_ctx_nr	= perf_sw_context,
-	.event_init	= intel_cqm_event_init,
-	.add		= intel_cqm_event_add,
-	.del		= intel_cqm_event_del,
-	.start		= intel_cqm_event_start,
-	.stop		= intel_cqm_event_stop,
-	.read		= intel_cqm_event_read,
-	.count		= intel_cqm_event_count,
+	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_del,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+	.count		     = intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -732,6 +1297,16 @@ static int __init intel_cqm_init(void)
 		}
 	}
 
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
 	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
 	str = kstrdup(scale, GFP_KERNEL);
 	if (!str) {
-- 
cgit v1.2.3


From 59bf7fd45c90a8fde22a7717b5413e4ed9666c32 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:48 +0000
Subject: perf/x86/intel: Enable conflicting event scheduling for CQM

We can leverage the workqueue that we use for RMID rotation to support
scheduling of conflicting monitoring events. Allowing events that
monitor conflicting things is done at various other places in the perf
subsystem, so there's precedent there.

An example of two conflicting events would be monitoring a cgroup and
simultaneously monitoring a task within that cgroup.

This uses the cache_groups list as a queuing mechanism, where every
event that reaches the front of the list gets the chance to be scheduled
in, possibly descheduling any conflicting events that are running.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-10-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 130 +++++++++++++++++++----------
 1 file changed, 84 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e31f5086f2b5..9a8ef8376fcd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -507,7 +507,6 @@ static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
 static bool intel_cqm_rmid_stabilize(unsigned int *available)
 {
 	struct cqm_rmid_entry *entry, *tmp;
-	struct perf_event *event;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -577,19 +576,9 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 
 		/*
 		 * If we have groups waiting for RMIDs, hand
-		 * them one now.
+		 * them one now provided they don't conflict.
 		 */
-		list_for_each_entry(event, &cache_groups,
-				    hw.cqm_groups_entry) {
-			if (__rmid_valid(event->hw.cqm_rmid))
-				continue;
-
-			intel_cqm_xchg_rmid(event, entry->rmid);
-			entry = NULL;
-			break;
-		}
-
-		if (!entry)
+		if (intel_cqm_sched_in_event(entry->rmid))
 			continue;
 
 		/*
@@ -604,25 +593,73 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 
 /*
  * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
  */
-static struct perf_event *
-__intel_cqm_pick_and_rotate(void)
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 {
 	struct perf_event *rotor;
+	unsigned int rmid;
 
 	lockdep_assert_held(&cache_mutex);
-	lockdep_assert_held(&cache_lock);
 
 	rotor = list_first_entry(&cache_groups, struct perf_event,
 				 hw.cqm_groups_entry);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned and we
+	 * don't need to rotate the list.
+	 */
+	if (next == rotor)
+		return;
+
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+	__put_rmid(rmid);
+
 	list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+	struct perf_event *group, *g;
+	unsigned int rmid;
+
+	lockdep_assert_held(&cache_mutex);
+
+	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+		if (group == event)
+			continue;
+
+		rmid = group->hw.cqm_rmid;
+
+		/*
+		 * Skip events that don't have a valid RMID.
+		 */
+		if (!__rmid_valid(rmid))
+			continue;
+
+		/*
+		 * No conflict? No problem! Leave the event alone.
+		 */
+		if (!__conflict_event(group, event))
+			continue;
 
-	return rotor;
+		intel_cqm_xchg_rmid(group, INVALID_RMID);
+		__put_rmid(rmid);
+	}
 }
 
 /*
  * Attempt to rotate the groups and assign new RMIDs.
  *
+ * We rotate for two reasons,
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
  * Rotating RMIDs is complicated because the hardware doesn't give us
  * any clues.
  *
@@ -642,11 +679,10 @@ __intel_cqm_pick_and_rotate(void)
  */
 static bool __intel_cqm_rmid_rotate(void)
 {
-	struct perf_event *group, *rotor, *start = NULL;
+	struct perf_event *group, *start = NULL;
 	unsigned int threshold_limit;
 	unsigned int nr_needed = 0;
 	unsigned int nr_available;
-	unsigned int rmid;
 	bool rotated = false;
 
 	mutex_lock(&cache_mutex);
@@ -678,7 +714,9 @@ again:
 		goto stabilize;
 
 	/*
-	 * We have more event groups without RMIDs than available RMIDs.
+	 * We have more event groups without RMIDs than available RMIDs,
+	 * or we have event groups that conflict with the ones currently
+	 * scheduled.
 	 *
 	 * We force deallocate the rmid of the group at the head of
 	 * cache_groups. The first event group without an RMID then gets
@@ -688,15 +726,7 @@ again:
 	 * Rotate the cache_groups list so the previous head is now the
 	 * tail.
 	 */
-	rotor = __intel_cqm_pick_and_rotate();
-	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-
-	/*
-	 * The group at the front of the list should always have a valid
-	 * RMID. If it doesn't then no groups have RMIDs assigned.
-	 */
-	if (!__rmid_valid(rmid))
-		goto stabilize;
+	__intel_cqm_pick_and_rotate(start);
 
 	/*
 	 * If the rotation is going to succeed, reduce the threshold so
@@ -704,14 +734,14 @@ again:
 	 */
 	if (__rmid_valid(intel_cqm_rotation_rmid)) {
 		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-		intel_cqm_rotation_rmid = INVALID_RMID;
+		intel_cqm_rotation_rmid = __get_rmid();
+
+		intel_cqm_sched_out_conflicting_events(start);
 
 		if (__intel_cqm_threshold)
 			__intel_cqm_threshold--;
 	}
 
-	__put_rmid(rmid);
-
 	rotated = true;
 
 stabilize:
@@ -794,25 +824,37 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
  *
  * If we're part of a group, we use the group's RMID.
  */
-static int intel_cqm_setup_event(struct perf_event *event,
-				 struct perf_event **group)
+static void intel_cqm_setup_event(struct perf_event *event,
+				  struct perf_event **group)
 {
 	struct perf_event *iter;
+	unsigned int rmid;
+	bool conflict = false;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+		rmid = iter->hw.cqm_rmid;
+
 		if (__match_event(iter, event)) {
 			/* All tasks in a group share an RMID */
-			event->hw.cqm_rmid = iter->hw.cqm_rmid;
+			event->hw.cqm_rmid = rmid;
 			*group = iter;
-			return 0;
+			return;
 		}
 
-		if (__conflict_event(iter, event))
-			return -EBUSY;
+		/*
+		 * We only care about conflicts for events that are
+		 * actually scheduled in (and hence have a valid RMID).
+		 */
+		if (__conflict_event(iter, event) && __rmid_valid(rmid))
+			conflict = true;
 	}
 
-	event->hw.cqm_rmid = __get_rmid();
-	return 0;
+	if (conflict)
+		rmid = INVALID_RMID;
+	else
+		rmid = __get_rmid();
+
+	event->hw.cqm_rmid = rmid;
 }
 
 static void intel_cqm_event_read(struct perf_event *event)
@@ -1030,7 +1072,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
 	bool rotate = false;
-	int err;
 
 	if (event->attr.type != intel_cqm_pmu.type)
 		return -ENOENT;
@@ -1056,9 +1097,7 @@ static int intel_cqm_event_init(struct perf_event *event)
 	mutex_lock(&cache_mutex);
 
 	/* Will also set rmid */
-	err = intel_cqm_setup_event(event, &group);
-	if (err)
-		goto out;
+	intel_cqm_setup_event(event, &group);
 
 	if (group) {
 		list_add_tail(&event->hw.cqm_group_entry,
@@ -1078,13 +1117,12 @@ static int intel_cqm_event_init(struct perf_event *event)
 			rotate = true;
 	}
 
-out:
 	mutex_unlock(&cache_mutex);
 
 	if (rotate)
 		schedule_delayed_work(&intel_cqm_rmid_work, 0);
 
-	return err;
+	return 0;
 }
 
 EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-- 
cgit v1.2.3


From 66c046b407166686aeea2e8c0abfa1c4e4b94f26 Mon Sep 17 00:00:00 2001
From: "Lad, Prabhakar" <prabhakar.csengg@gmail.com>
Date: Thu, 5 Feb 2015 10:29:35 +0000
Subject: crypto: sha-mb - Fix big integer constant sparse warning

this patch fixes following sparse warning:

sha1_mb_mgr_init_avx2.c:59:31: warning: constant 0xF76543210 is so big it is long

Signed-off-by: Lad, Prabhakar <prabhakar.csengg@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
index 4ca7e166a2aa..822acb5b464c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -56,7 +56,7 @@
 void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
 {
 	unsigned int j;
-	state->unused_lanes = 0xF76543210;
+	state->unused_lanes = 0xF76543210ULL;
 	for (j = 0; j < 8; j++) {
 		state->lens[j] = 0xFFFFFFFF;
 		state->ldata[j].job_in_lane = NULL;
-- 
cgit v1.2.3


From 81e397d937a8e9f46f024a9f876cf14d8e2b45a7 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@intel.com>
Date: Fri, 6 Feb 2015 10:25:20 -0800
Subject: crypto: aesni - make driver-gcm-aes-aesni helper a proper aead alg

Changed the __driver-gcm-aes-aesni to be a proper aead algorithm.
This required a valid setkey and setauthsize functions to be added and also
some changes to make sure that math context is not corrupted when the alg is
used directly.
Note that the __driver-gcm-aes-aesni should not be used directly by modules
that can use it in interrupt context as we don't have a good fallback mechanism
in this case.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 164 +++++++++++++++++++++++++------------
 1 file changed, 110 insertions(+), 54 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 947c6bf52c33..6893f4947583 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -890,15 +890,12 @@ out_free_ablkcipher:
 	return ret;
 }
 
-static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
-						   unsigned int key_len)
+static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
+				  unsigned int key_len)
 {
 	int ret = 0;
-	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-	struct aesni_rfc4106_gcm_ctx *child_ctx =
-                                 aesni_rfc4106_gcm_ctx_get(cryptd_child);
+	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
 	u8 *new_key_align, *new_key_mem = NULL;
 
 	if (key_len < 4) {
@@ -943,20 +940,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 		goto exit;
 	}
 	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
-	memcpy(child_ctx, ctx, sizeof(*ctx));
 exit:
 	kfree(new_key_mem);
 	return ret;
 }
 
-/* This is the Integrity Check Value (aka the authentication tag length and can
- * be 8, 12 or 16 bytes long. */
-static int rfc4106_set_authsize(struct crypto_aead *parent,
-				unsigned int authsize)
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+			   unsigned int key_len)
 {
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
+	struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
+	struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
+	int ret;
 
+	ret = crypto_aead_setkey(child, key, key_len);
+	if (!ret) {
+		memcpy(ctx, c_ctx, sizeof(*ctx));
+		ctx->cryptd_tfm = cryptd_tfm;
+	}
+	return ret;
+}
+
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+				       unsigned int authsize)
+{
 	switch (authsize) {
 	case 8:
 	case 12:
@@ -965,51 +973,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
 	default:
 		return -EINVAL;
 	}
-	crypto_aead_crt(parent)->authsize = authsize;
-	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	crypto_aead_crt(aead)->authsize = authsize;
 	return 0;
 }
 
-static int rfc4106_encrypt(struct aead_request *req)
-{
-	int ret;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_aead_encrypt(cryptd_req);
-	} else {
-		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-		kernel_fpu_begin();
-		ret = cryptd_child->base.crt_aead.encrypt(req);
-		kernel_fpu_end();
-		return ret;
-	}
-}
-
-static int rfc4106_decrypt(struct aead_request *req)
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
 {
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
 	int ret;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
 
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_aead_decrypt(cryptd_req);
-	} else {
-		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-		kernel_fpu_begin();
-		ret = cryptd_child->base.crt_aead.decrypt(req);
-		kernel_fpu_end();
-		return ret;
-	}
+	ret = crypto_aead_setauthsize(child, authsize);
+	if (!ret)
+		crypto_aead_crt(parent)->authsize = authsize;
+	return ret;
 }
 
 static int __driver_rfc4106_encrypt(struct aead_request *req)
@@ -1185,6 +1165,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	}
 	return retval;
 }
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		ret = crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_encrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		ret = crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_decrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+
+	if (unlikely(!irq_fpu_usable())) {
+		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+		ret = -EINVAL;
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_encrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+
+	if (unlikely(!irq_fpu_usable())) {
+		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+		ret = -EINVAL;
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_decrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
 #endif
 
 static struct crypto_alg aesni_algs[] = { {
@@ -1366,8 +1418,12 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.aead = {
-			.encrypt	= __driver_rfc4106_encrypt,
-			.decrypt	= __driver_rfc4106_decrypt,
+			.setkey		= common_rfc4106_set_key,
+			.setauthsize	= common_rfc4106_set_authsize,
+			.encrypt	= helper_rfc4106_encrypt,
+			.decrypt	= helper_rfc4106_decrypt,
+			.ivsize		= 8,
+			.maxauthsize	= 16,
 		},
 	},
 }, {
-- 
cgit v1.2.3


From 89f9f6796d41e10e224b0cb0027ddd78cb881f65 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Fri, 27 Feb 2015 11:25:59 -0800
Subject: hv: vmbus_post_msg: retry the hypercall on some transient errors

I got HV_STATUS_INVALID_CONNECTION_ID on Hyper-V 2008 R2 when keeping running
"rmmod hv_netvsc; modprobe hv_netvsc; rmmod hv_utils; modprobe hv_utils"
in a Linux guest. Looks the host has some kind of throttling mechanism if
some kinds of hypercalls are sent too frequently.
Without the patch, the driver can occasionally fail to load.

Also let's retry HV_STATUS_INSUFFICIENT_MEMORY, though we didn't get it
before.

Removed 'case -ENOMEM', since the hypervisor doesn't return this.

CC: "K. Y. Srinivasan" <kys@microsoft.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/uapi/asm/hyperv.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 90c458e66e13..ce6068dbcfbc 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -225,6 +225,8 @@
 #define HV_STATUS_INVALID_HYPERCALL_CODE	2
 #define HV_STATUS_INVALID_HYPERCALL_INPUT	3
 #define HV_STATUS_INVALID_ALIGNMENT		4
+#define HV_STATUS_INSUFFICIENT_MEMORY		11
+#define HV_STATUS_INVALID_CONNECTION_ID		18
 #define HV_STATUS_INSUFFICIENT_BUFFERS		19
 
 typedef struct _HV_REFERENCE_TSC_PAGE {
-- 
cgit v1.2.3


From d089f8e97d371a662dd233491e03bda377c9d46d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 5 Mar 2015 10:49:17 +1030
Subject: x86: fix up obsolete cpu function usage.

Thanks to spatch, plus manual removal of "&*".

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: x86@kernel.org
---
 arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++----
 arch/x86/kernel/irq.c                 | 4 ++--
 arch/x86/platform/uv/tlb_uv.c         | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21681c8..d9d0bd2faaf4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
 
 	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
 
-	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
 	for_each_online_cpu(cpu) {
 		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
 			continue;
-		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
-		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
 	}
 }
 
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
 
 	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
 
-	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
 	register_hotcpu_notifier(&x2apic_cpu_notifier);
 	return 1;
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe0093a..e5952c225532 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
 	this_cpu = smp_processor_id();
 	cpumask_copy(&online_new, cpu_online_mask);
-	cpu_clear(this_cpu, online_new);
+	cpumask_clear_cpu(this_cpu, &online_new);
 
 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
 			data = irq_desc_get_irq_data(desc);
 			cpumask_copy(&affinity_new, data->affinity);
-			cpu_clear(this_cpu, affinity_new);
+			cpumask_clear_cpu(this_cpu, &affinity_new);
 
 			/* Do not count inactive or per-cpu irqs. */
 			if (!irq_has_action(irq) || irqd_is_per_cpu(data))
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 994798548b1a..3b6ec42718e4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 	struct reset_args reset_args;
 
 	reset_args.sender = sender;
-	cpus_clear(*mask);
+	cpumask_clear(mask);
 	/* find a single cpu for each uvhub in this distribution mask */
 	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
 	/* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 			continue;
 		apnode = pnode + bcp->partition_base_pnode;
 		cpu = pnode_to_first_cpu(apnode, smaster);
-		cpu_set(cpu, *mask);
+		cpumask_set_cpu(cpu, mask);
 	}
 
 	/* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	/* don't actually do a shootdown of the local cpu */
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
-	if (cpu_isset(cpu, *cpumask))
+	if (cpumask_test_cpu(cpu, cpumask))
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-- 
cgit v1.2.3


From d939be3add4f1410079dad2755d4936cdb70903b Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Fri, 27 Feb 2015 23:52:31 +0900
Subject: treewide: Fix typo in printk messages

This patch fix spelling typo in printk messages.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/kernel/test_rodata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
 	/* test 3: check the value hasn't changed */
 	/* If this test fails, we managed to overwrite the data */
 	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
+		printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
 		return -ENODEV;
 	}
 	/* test 4: check if the rodata section is 4Kb aligned */
-- 
cgit v1.2.3


From 1bd187de536494a27925902b9653e9ef0ace4774 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 23 Feb 2015 16:24:44 +0200
Subject: x86, intel-mid: remove Intel MID specific serial support

Since we have a native 8250 driver carrying the Intel MID serial devices the
specific support is not needed anymore. This patch removes it for Intel MID.

Note that the console device name is changed from ttyMFDx to ttySx.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/Kconfig.debug                             |   4 -
 arch/x86/include/asm/intel-mid.h                   |   3 -
 arch/x86/kernel/early_printk.c                     |   6 --
 arch/x86/platform/intel-mid/Makefile               |   1 -
 .../platform/intel-mid/early_printk_intel_mid.c    | 112 ---------------------
 5 files changed, 126 deletions(-)
 delete mode 100644 arch/x86/platform/intel-mid/early_printk_intel_mid.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 20028da8ae18..72484a645f05 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,10 +43,6 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
-config EARLY_PRINTK_INTEL_MID
-	bool "Early printk for Intel MID platform support"
-	depends on EARLY_PRINTK && X86_INTEL_MID
-
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
 	depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 705d35708a50..7c5af123bdbd 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
-extern struct console early_hsu_console;
-extern void hsu_early_console_init(const char *);
-
 extern void intel_scu_devices_create(void);
 extern void intel_scu_devices_destroy(void);
 
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a1be88..f85e3fb50f28 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -375,12 +375,6 @@ static int __init setup_early_printk(char *buf)
 		if (!strncmp(buf, "xen", 3))
 			early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
-		if (!strncmp(buf, "hsu", 3)) {
-			hsu_early_console_init(buf + 3);
-			early_console_register(&early_hsu_console, keep);
-		}
-#endif
 #ifdef CONFIG_EARLY_PRINTK_EFI
 		if (!strncmp(buf, "efi", 3))
 			early_console_register(&early_efi_console, keep);
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 0a8ee703b9fa..0ce1b1913673 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,5 +1,4 @@
 obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o
-obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
 
 # SFI specific code
 ifdef CONFIG_X86_INTEL_MID
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
deleted file mode 100644
index 4e720829ab90..000000000000
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * early_printk_intel_mid.c - early consoles for Intel MID platforms
- *
- * Copyright (c) 2008-2010, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-/*
- * This file implements early console named hsu.
- * hsu is based on a High Speed UART device which only exists in the Medfield
- * platform
- */
-
-#include <linux/serial_reg.h>
-#include <linux/serial_mfd.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/intel-mid.h>
-
-/*
- * Following is the early console based on Medfield HSU (High
- * Speed UART) device.
- */
-#define HSU_PORT_BASE		0xffa28080
-
-static void __iomem *phsu;
-
-void hsu_early_console_init(const char *s)
-{
-	unsigned long paddr, port = 0;
-	u8 lcr;
-
-	/*
-	 * Select the early HSU console port if specified by user in the
-	 * kernel command line.
-	 */
-	if (*s && !kstrtoul(s, 10, &port))
-		port = clamp_val(port, 0, 2);
-
-	paddr = HSU_PORT_BASE + port * 0x80;
-	phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
-
-	/* Disable FIFO */
-	writeb(0x0, phsu + UART_FCR);
-
-	/* Set to default 115200 bps, 8n1 */
-	lcr = readb(phsu + UART_LCR);
-	writeb((0x80 | lcr), phsu + UART_LCR);
-	writeb(0x18, phsu + UART_DLL);
-	writeb(lcr,  phsu + UART_LCR);
-	writel(0x3600, phsu + UART_MUL*4);
-
-	writeb(0x8, phsu + UART_MCR);
-	writeb(0x7, phsu + UART_FCR);
-	writeb(0x3, phsu + UART_LCR);
-
-	/* Clear IRQ status */
-	readb(phsu + UART_LSR);
-	readb(phsu + UART_RX);
-	readb(phsu + UART_IIR);
-	readb(phsu + UART_MSR);
-
-	/* Enable FIFO */
-	writeb(0x7, phsu + UART_FCR);
-}
-
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
-static void early_hsu_putc(char ch)
-{
-	unsigned int timeout = 10000; /* 10ms */
-	u8 status;
-
-	while (--timeout) {
-		status = readb(phsu + UART_LSR);
-		if (status & BOTH_EMPTY)
-			break;
-		udelay(1);
-	}
-
-	/* Only write the char when there was no timeout */
-	if (timeout)
-		writeb(ch, phsu + UART_TX);
-}
-
-static void early_hsu_write(struct console *con, const char *str, unsigned n)
-{
-	int i;
-
-	for (i = 0; i < n && *str; i++) {
-		if (*str == '\n')
-			early_hsu_putc('\r');
-		early_hsu_putc(*str);
-		str++;
-	}
-}
-
-struct console early_hsu_console = {
-	.name =		"earlyhsu",
-	.write =	early_hsu_write,
-	.flags =	CON_PRINTBUFFER,
-	.index =	-1,
-};
-- 
cgit v1.2.3


From c467ea763fd5d8795b7d1b5a78eb94b6ad8f66ad Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 4 Mar 2015 18:06:33 +0100
Subject: context_tracking: Rename context symbols to prepare for transition
 state

Current context tracking symbols are designed to express living state.
As such they are prefixed with "IN_": IN_USER, IN_KERNEL.

Now we are going to use these symbols to also express state transitions
such as context_tracking_enter(IN_USER) or context_tracking_exit(IN_USER).
But while the "IN_" prefix works well to express entering a context, it's
confusing to depict a context exit: context_tracking_exit(IN_USER)
could mean two things:
	1) We are exiting the current context to enter user context.
	2) We are exiting the user context
We want 2) but the reviewer may be confused and understand 1)

So lets disambiguate these symbols and rename them to CONTEXT_USER and
CONTEXT_KERNEL.

Acked-by: Rik van Riel <riel@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will deacon <will.deacon@arm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e2ecc9..756f74eed35d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = IN_KERNEL;  /* the value is irrelevant. */
+		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
-- 
cgit v1.2.3


From fdaf3a6539d6b9b6a7240c3b00bd81c813b2760d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 10 Mar 2015 12:43:45 +1030
Subject: x86: fix more deprecated cpu function usage.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index d9d0bd2faaf4..ab3219b3fbda 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -171,8 +171,8 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		for_each_online_cpu(cpu) {
 			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
 				continue;
-			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
-			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+			cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
 		}
 		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
 		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
-- 
cgit v1.2.3


From 079119a2c7c83506ad753e7b95e9ed253e9963f9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 11 Mar 2015 13:52:53 +0200
Subject: serial, x86: use UPF_* constants for flags

This patch fixes the following sparse warnings:

drivers/tty/serial/8250/8250_core.c:3231:32: warning: incorrect type in assignment (different base types)
drivers/tty/serial/8250/8250_core.c:3231:32:    expected restricted upf_t [usertype] flags
drivers/tty/serial/8250/8250_core.c:3231:32:    got unsigned int const [unsigned] flags

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/serial.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 460b84f64556..8378b8c9109c 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -12,11 +12,11 @@
 
 /* Standard COM flags (except for COM4, because of the 8514 problem) */
 #ifdef CONFIG_SERIAL_DETECT_IRQ
-# define STD_COMX_FLAGS	(ASYNC_BOOT_AUTOCONF |	ASYNC_SKIP_TEST	| ASYNC_AUTO_IRQ)
-# define STD_COM4_FLAGS	(ASYNC_BOOT_AUTOCONF |	0		| ASYNC_AUTO_IRQ)
+# define STD_COMX_FLAGS	(UPF_BOOT_AUTOCONF |	UPF_SKIP_TEST	| UPF_AUTO_IRQ)
+# define STD_COM4_FLAGS	(UPF_BOOT_AUTOCONF |	0		| UPF_AUTO_IRQ)
 #else
-# define STD_COMX_FLAGS	(ASYNC_BOOT_AUTOCONF |	ASYNC_SKIP_TEST	| 0		)
-# define STD_COM4_FLAGS	(ASYNC_BOOT_AUTOCONF |	0		| 0		)
+# define STD_COMX_FLAGS	(UPF_BOOT_AUTOCONF |	UPF_SKIP_TEST	| 0		)
+# define STD_COM4_FLAGS	(UPF_BOOT_AUTOCONF |	0		| 0		)
 #endif
 
 #define SERIAL_PORT_DFNS								\
-- 
cgit v1.2.3


From 2a442c9c6453d3d043dfd89f2e03a1deff8a6f06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Feb 2015 11:42:15 -0800
Subject: x86: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <x86@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: <xen-devel@lists.xenproject.org>
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  2 +-
 arch/x86/kernel/smpboot.c  | 39 ++++++++++++++++++---------------------
 arch/x86/xen/smp.c         | 46 +++++++++++++++++++++++++---------------------
 4 files changed, 44 insertions(+), 45 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..a5cb4f6e9492 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..c8fa34963ead 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include <asm/realmode.h>
 #include <asm/misc.h>
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
 	lock_vector_lock();
 	set_cpu_online(smp_processor_id(), true);
 	unlock_vector_lock();
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	cpu_set_state_online(smp_processor_id());
 	x86_platform.nmi_init();
 
 	/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	 */
 	mtrr_save_state();
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/* x86 CPUs take themselves offline, so delayed offline is OK. */
+	err = cpu_check_up_prepare(cpu);
+	if (err && err != -EBUSY)
+		return err;
 
 	/* the FPU context is blank, nobody can own it */
 	__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
 	switch_to_new_gdt(me);
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
-	per_cpu(cpu_state, me) = CPU_ONLINE;
+	cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
 	numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
 	int cpu = smp_processor_id();
 
-	init_completion(&per_cpu(die_complete, smp_processor_id()));
-
 	remove_siblinginfo(cpu);
 
 	/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
 	return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
 {
-	wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+	int ret = 0;
 
-void native_cpu_die(unsigned int cpu)
-{
 	/* We don't do anything here: idle task is faking death itself. */
 
-	cpu_die_common(cpu);
-
 	/* They ack this in play_dead() by setting CPU_DEAD */
-	if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+	if (cpu_wait_death(cpu, 5)) {
 		if (system_state == SYSTEM_RUNNING)
 			pr_info("CPU %u is now offline\n", cpu);
 	} else {
 		pr_err("CPU %u didn't die...\n", cpu);
+		ret = -1;
 	}
+
+	return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+	common_cpu_die(cpu);
 }
 
 void play_dead_common(void)
@@ -1375,10 +1374,8 @@ void play_dead_common(void)
 	reset_lazy_tlbstate();
 	amd_e400_remove_cpu(raw_smp_processor_id());
 
-	mb();
 	/* Ack it */
-	__this_cpu_write(cpu_state, CPU_DEAD);
-	complete(&per_cpu(die_complete, smp_processor_id()));
+	(void)cpu_report_death();
 
 	/*
 	 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..1c5e760f34ca 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
 
 	set_cpu_online(cpu, true);
 
-	this_cpu_write(cpu_state, CPU_ONLINE);
-
-	wmb();
+	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
 
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
-
-	wmb();			/* make sure everything is out */
 }
 
 /*
@@ -459,7 +455,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/*
+	 * PV VCPUs are always successfully taken down (see 'while' loop
+	 * in xen_cpu_die()), so -EBUSY is an error.
+	 */
+	rc = cpu_check_up_prepare(cpu);
+	if (rc)
+		return rc;
 
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -479,10 +481,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 
-	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+	while (cpu_report_state(cpu) != CPU_ONLINE)
 		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
-		barrier();
-	}
 
 	return 0;
 }
@@ -511,11 +511,11 @@ static void xen_cpu_die(unsigned int cpu)
 		schedule_timeout(HZ/10);
 	}
 
-	cpu_die_common(cpu);
-
-	xen_smp_intr_free(cpu);
-	xen_uninit_lock_cpu(cpu);
-	xen_teardown_timer(cpu);
+	if (common_cpu_die(cpu) == 0) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+		xen_teardown_timer(cpu);
+	}
 }
 
 static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +747,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
 static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc;
+
+	/*
+	 * This can happen if CPU was offlined earlier and
+	 * offlining timed out in common_cpu_die().
+	 */
+	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+	}
+
 	/*
 	 * xen_smp_intr_init() needs to run before native_cpu_up()
 	 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return rc;
 }
 
-static void xen_hvm_cpu_die(unsigned int cpu)
-{
-	xen_cpu_die(cpu);
-	native_cpu_die(cpu);
-}
-
 void __init xen_hvm_smp_init(void)
 {
 	if (!xen_have_vector_callback)
@@ -781,7 +785,7 @@ void __init xen_hvm_smp_init(void)
 	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
 	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
 	smp_ops.cpu_up = xen_hvm_cpu_up;
-	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.cpu_die = xen_cpu_die;
 	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
 	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
 	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
-- 
cgit v1.2.3


From 05713ba9055f1a209d50e480626f36c401bda3ad Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Wed, 11 Mar 2015 17:56:26 +0100
Subject: crypto: don't export static symbol

The semantic patch that fixes this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@r@
type T;
identifier f;
@@

static T f (...) { ... }

@@
identifier r.f;
declarer name EXPORT_SYMBOL_GPL;
@@

-EXPORT_SYMBOL_GPL(f);
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/glue_helper.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
 
 	le128_to_be128((be128 *)walk->iv, &ctrblk);
 }
-EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
 
 static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 					    struct blkcipher_desc *desc,
-- 
cgit v1.2.3


From 297d716f6260cc9421d971b124ca196b957ee458 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Thu, 12 Mar 2015 08:44:11 +0100
Subject: power_supply: Change ownership from driver to core

Change the ownership of power_supply structure from each driver
implementing the class to the power supply core.

The patch changes power_supply_register() function thus all drivers
implementing power supply class are adjusted.

Each driver provides the implementation of power supply. However it
should not be the owner of power supply class instance because it is
exposed by core to other subsystems with power_supply_get_by_name().
These other subsystems have no knowledge when the driver will unregister
the power supply. This leads to several issues when driver is unbound -
mostly because user of power supply accesses freed memory.

Instead let the core own the instance of struct 'power_supply'.  Other
users of this power supply will still access valid memory because it
will be freed when device reference count reaches 0. Currently this
means "it will leak" but power_supply_put() call in next patches will
solve it.

This solves invalid memory references in following race condition
scenario:

Thread 1: charger manager
Thread 2: power supply driver, used by charger manager

THREAD 1 (charger manager)         THREAD 2 (power supply driver)
==========================         ==============================
psy = power_supply_get_by_name()
                                   Driver unbind, .remove
                                     power_supply_unregister()
                                     Device fully removed
psy->get_property()

The 'get_property' call is executed in invalid context because the driver was
unbound and struct 'power_supply' memory was freed.

This could be observed easily with charger manager driver (here compiled
with max17040 fuel gauge):

$ cat /sys/devices/virtual/power_supply/cm-battery/capacity &
$ echo "1-0036" > /sys/bus/i2c/drivers/max17040/unbind
[   55.725123] Unable to handle kernel NULL pointer dereference at virtual address 00000000
[   55.732584] pgd = d98d4000
[   55.734060] [00000000] *pgd=5afa2831, *pte=00000000, *ppte=00000000
[   55.740318] Internal error: Oops: 80000007 [#1] PREEMPT SMP ARM
[   55.746210] Modules linked in:
[   55.749259] CPU: 1 PID: 2936 Comm: cat Tainted: G        W       3.19.0-rc1-next-20141226-00048-gf79f475f3c44-dirty #1496
[   55.760190] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree)
[   55.766270] task: d9b76f00 ti: daf54000 task.ti: daf54000
[   55.771647] PC is at 0x0
[   55.774182] LR is at charger_get_property+0x2f4/0x36c
[   55.779201] pc : [<00000000>]    lr : [<c034b0b4>]    psr: 60000013
[   55.779201] sp : daf55e90  ip : 00000003  fp : 00000000
[   55.790657] r10: 00000000  r9 : c06e2878  r8 : d9b26c68
[   55.795865] r7 : dad81610  r6 : daec7410  r5 : daf55ebc  r4 : 00000000
[   55.802367] r3 : 00000000  r2 : daf55ebc  r1 : 0000002a  r0 : d9b26c68
[   55.808879] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
[   55.815994] Control: 10c5387d  Table: 598d406a  DAC: 00000015
[   55.821723] Process cat (pid: 2936, stack limit = 0xdaf54210)
[   55.827451] Stack: (0xdaf55e90 to 0xdaf56000)
[   55.831795] 5e80:                                     60000013 c01459c4 0000002a c06f8ef8
[   55.839956] 5ea0: db651000 c06f8ef8 daebac00 c04cb668 daebac08 c0346864 00000000 c01459c4
[   55.848115] 5ec0: d99eaa80 c06f8ef8 00000fff 00001000 db651000 c027f25c c027f240 d99eaa80
[   55.856274] 5ee0: d9a06c00 c0146218 daf55f18 00001000 d99eaa80 db4c18c0 00000001 00000001
[   55.864468] 5f00: daf55f80 c0144c78 c0144c54 c0107f90 00015000 d99eaab0 00000000 00000000
[   55.872603] 5f20: 000051c7 00000000 db4c18c0 c04a9370 00015000 00001000 daf55f80 00001000
[   55.880763] 5f40: daf54000 00015000 00000000 c00e53dc db4c18c0 c00e548c 0000000d 00008124
[   55.888937] 5f60: 00000001 00000000 00000000 db4c18c0 db4c18c0 00001000 00015000 c00e5550
[   55.897099] 5f80: 00000000 00000000 00001000 00001000 00015000 00000003 00000003 c000f364
[   55.905239] 5fa0: 00000000 c000f1a0 00001000 00015000 00000003 00015000 00001000 0001333c
[   55.913399] 5fc0: 00001000 00015000 00000003 00000003 00000002 00000000 00000000 00000000
[   55.921560] 5fe0: 7fffe000 be999850 0000a225 b6f3c19c 60000010 00000003 00000000 00000000
[   55.929744] [<c034b0b4>] (charger_get_property) from [<c0346864>] (power_supply_show_property+0x48/0x20c)
[   55.939286] [<c0346864>] (power_supply_show_property) from [<c027f25c>] (dev_attr_show+0x1c/0x48)
[   55.948130] [<c027f25c>] (dev_attr_show) from [<c0146218>] (sysfs_kf_seq_show+0x84/0x104)
[   55.956298] [<c0146218>] (sysfs_kf_seq_show) from [<c0144c78>] (kernfs_seq_show+0x24/0x28)
[   55.964536] [<c0144c78>] (kernfs_seq_show) from [<c0107f90>] (seq_read+0x1b0/0x484)
[   55.972172] [<c0107f90>] (seq_read) from [<c00e53dc>] (__vfs_read+0x18/0x4c)
[   55.979188] [<c00e53dc>] (__vfs_read) from [<c00e548c>] (vfs_read+0x7c/0x100)
[   55.986304] [<c00e548c>] (vfs_read) from [<c00e5550>] (SyS_read+0x40/0x8c)
[   55.993164] [<c00e5550>] (SyS_read) from [<c000f1a0>] (ret_fast_syscall+0x0/0x48)
[   56.000626] Code: bad PC value
[   56.011652] ---[ end trace 7b64343fbdae8ef1 ]---

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>

[for the nvec part]
Reviewed-by: Marc Dietrich <marvin24@gmx.de>

[for compal-laptop.c]
Acked-by: Darren Hart <dvhart@linux.intel.com>

[for the mfd part]
Acked-by: Lee Jones <lee.jones@linaro.org>

[for the hid part]
Acked-by: Jiri Kosina <jkosina@suse.cz>

[for the acpi part]
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c  | 4 ++--
 arch/x86/platform/olpc/olpc-xo15-sci.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..e4ed28bbf79d 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..186634e9021d 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
-- 
cgit v1.2.3


From ed6dad52298152a5c493223234e431f206c5a46b Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Thu, 12 Mar 2015 08:44:15 +0100
Subject: x86/olpc/xo1/sci: Use newly added power_supply_put API

Replace direct usage of put_device() with new API: power_supply_put().

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Reviewed-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index e4ed28bbf79d..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
-- 
cgit v1.2.3


From 67273a1b421a12ef77bcf08b027d165f399054ae Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Thu, 12 Mar 2015 08:44:16 +0100
Subject: x86/olpc/xo15/sci: Use newly added power_supply_put API

Replace direct usage of put_device() with new API: power_supply_put().

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Reviewed-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo15-sci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 186634e9021d..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
-- 
cgit v1.2.3


From c42e9902f3f1e731a7b83397acdf402eeb3237ca Mon Sep 17 00:00:00 2001
From: Ameen Ali <ameenali023@gmail.com>
Date: Fri, 13 Mar 2015 23:38:21 +0200
Subject: crypto: sha1-mb - Syntax error

fixing a syntax-error .

Signed-off-by: Ameen Ali <AmeenAli023@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index fd9f6b035b16..9414fd964ecd 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -828,7 +828,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
 	while (!list_empty(&cstate->work_list)) {
 		rctx = list_entry(cstate->work_list.next,
 				struct mcryptd_hash_request_ctx, waiter);
-		if time_before(cur_time, rctx->tag.expire)
+		if (time_before(cur_time, rctx->tag.expire))
 			break;
 		kernel_fpu_begin();
 		sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
-- 
cgit v1.2.3


From 9b4ade226f7468bb26f98b6cd01cb5b8a05fc96d Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 21 Jan 2015 08:49:22 +0100
Subject: xen: build infrastructure for generating hypercall depending symbols

Today there are several places in the kernel which build tables
containing one entry for each possible Xen hypercall. Create an
infrastructure to be able to generate these tables at build time.

Based-on-patch-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/syscalls/Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 3323c2745248..a55abb9f6c5e 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR  $@
 quiet_cmd_systbl = SYSTBL  $@
       cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
 
+quiet_cmd_hypercalls = HYPERCALLS $@
+      cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
+
 syshdr_abi_unistd_32 := i386
 $(uapi)/unistd_32.h: $(syscall32) $(syshdr)
 	$(call if_changed,syshdr)
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
 $(out)/syscalls_64.h: $(syscall64) $(systbl)
 	$(call if_changed,systbl)
 
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
+	$(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
 uapisyshdr-y			+= unistd_32.h unistd_64.h unistd_x32.h
 syshdr-y			+= syscalls_32.h
 syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h unistd_64_x32.h
 syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+syshdr-$(CONFIG_XEN)		+= xen-hypercalls.h
 
 targets	+= $(uapisyshdr-y) $(syshdr-y)
 
-- 
cgit v1.2.3


From 16b12d605795e775cd80b5901c85dd2e320a6ec2 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 21 Jan 2015 08:49:23 +0100
Subject: xen: synchronize include/xen/interface/xen.h with xen

The header include/xen/interface/xen.h doesn't contain all definitions
from Xen's version of that header. Update it accordingly.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 520022d1a181..8296cdf4e581 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -29,7 +29,7 @@ static const char *xen_hypercall_names[] = {
 	N(vcpu_op),
 	N(set_segment_base),
 	N(mmuext_op),
-	N(acm_op),
+	N(xsm_op),
 	N(nmi_op),
 	N(sched_op),
 	N(callback_op),
-- 
cgit v1.2.3


From fc903f87364d1b00890fa991a6f116e2bb38ec1e Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 21 Jan 2015 08:49:24 +0100
Subject: xen: use generated hypervisor symbols in arch/x86/xen/trace.c

Instead of manually list all hypervisor calls in arch/x86/xen/trace.c
use the auto generated list.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/trace.c | 50 ++++----------------------------------------------
 1 file changed, 4 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 8296cdf4e581..a702ec2f5931 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -1,54 +1,12 @@
 #include <linux/ftrace.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
 
-#define N(x)	[__HYPERVISOR_##x] = "("#x")"
+#define HYPERCALL(x)	[__HYPERVISOR_##x] = "("#x")",
 static const char *xen_hypercall_names[] = {
-	N(set_trap_table),
-	N(mmu_update),
-	N(set_gdt),
-	N(stack_switch),
-	N(set_callbacks),
-	N(fpu_taskswitch),
-	N(sched_op_compat),
-	N(dom0_op),
-	N(set_debugreg),
-	N(get_debugreg),
-	N(update_descriptor),
-	N(memory_op),
-	N(multicall),
-	N(update_va_mapping),
-	N(set_timer_op),
-	N(event_channel_op_compat),
-	N(xen_version),
-	N(console_io),
-	N(physdev_op_compat),
-	N(grant_table_op),
-	N(vm_assist),
-	N(update_va_mapping_otherdomain),
-	N(iret),
-	N(vcpu_op),
-	N(set_segment_base),
-	N(mmuext_op),
-	N(xsm_op),
-	N(nmi_op),
-	N(sched_op),
-	N(callback_op),
-	N(xenoprof_op),
-	N(event_channel_op),
-	N(physdev_op),
-	N(hvm_op),
-
-/* Architecture-specific hypercall definitions. */
-	N(arch_0),
-	N(arch_1),
-	N(arch_2),
-	N(arch_3),
-	N(arch_4),
-	N(arch_5),
-	N(arch_6),
-	N(arch_7),
+#include <asm/xen-hypercalls.h>
 };
-#undef N
+#undef HYPERCALL
 
 static const char *xen_hypercall_name(unsigned op)
 {
-- 
cgit v1.2.3


From 526abeaed4971def462f209632b88fd7b8c4a09c Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 21 Jan 2015 08:49:25 +0100
Subject: xen: use generated hypercall symbols in arch/x86/xen/xen-head.S

Instead of manually list each hypercall in arch/x86/xen/xen-head.S
use the auto generated symbol list.

This also corrects the wrong address of xen_hypercall_mca which was
located 32 bytes higher than it should.

Symbol addresses have been verified to match the correct ones via
objdump output.

Based-on-patch-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/xen-head.S | 63 ++++++++-----------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 674b222544b7..8afdfccf6086 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -12,6 +12,8 @@
 
 #include <xen/interface/elfnote.h>
 #include <xen/interface/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
 #include <asm/xen/interface.h>
 
 #ifdef CONFIG_XEN_PVH
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init)
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
-#define NEXT_HYPERCALL(x) \
-	ENTRY(xen_hypercall_##x) \
-	.skip 32
-
-NEXT_HYPERCALL(set_trap_table)
-NEXT_HYPERCALL(mmu_update)
-NEXT_HYPERCALL(set_gdt)
-NEXT_HYPERCALL(stack_switch)
-NEXT_HYPERCALL(set_callbacks)
-NEXT_HYPERCALL(fpu_taskswitch)
-NEXT_HYPERCALL(sched_op_compat)
-NEXT_HYPERCALL(platform_op)
-NEXT_HYPERCALL(set_debugreg)
-NEXT_HYPERCALL(get_debugreg)
-NEXT_HYPERCALL(update_descriptor)
-NEXT_HYPERCALL(ni)
-NEXT_HYPERCALL(memory_op)
-NEXT_HYPERCALL(multicall)
-NEXT_HYPERCALL(update_va_mapping)
-NEXT_HYPERCALL(set_timer_op)
-NEXT_HYPERCALL(event_channel_op_compat)
-NEXT_HYPERCALL(xen_version)
-NEXT_HYPERCALL(console_io)
-NEXT_HYPERCALL(physdev_op_compat)
-NEXT_HYPERCALL(grant_table_op)
-NEXT_HYPERCALL(vm_assist)
-NEXT_HYPERCALL(update_va_mapping_otherdomain)
-NEXT_HYPERCALL(iret)
-NEXT_HYPERCALL(vcpu_op)
-NEXT_HYPERCALL(set_segment_base)
-NEXT_HYPERCALL(mmuext_op)
-NEXT_HYPERCALL(xsm_op)
-NEXT_HYPERCALL(nmi_op)
-NEXT_HYPERCALL(sched_op)
-NEXT_HYPERCALL(callback_op)
-NEXT_HYPERCALL(xenoprof_op)
-NEXT_HYPERCALL(event_channel_op)
-NEXT_HYPERCALL(physdev_op)
-NEXT_HYPERCALL(hvm_op)
-NEXT_HYPERCALL(sysctl)
-NEXT_HYPERCALL(domctl)
-NEXT_HYPERCALL(kexec_op)
-NEXT_HYPERCALL(tmem_op) /* 38 */
-ENTRY(xen_hypercall_rsvr)
-	.skip 320
-NEXT_HYPERCALL(mca) /* 48 */
-NEXT_HYPERCALL(arch_1)
-NEXT_HYPERCALL(arch_2)
-NEXT_HYPERCALL(arch_3)
-NEXT_HYPERCALL(arch_4)
-NEXT_HYPERCALL(arch_5)
-NEXT_HYPERCALL(arch_6)
-	.balign PAGE_SIZE
+	.skip PAGE_SIZE
+
+#define HYPERCALL(n) \
+	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
-- 
cgit v1.2.3


From feb44f1f7a4ac299d1ab1c3606860e70b9b89d69 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 2 Mar 2015 12:06:23 -0500
Subject: x86/xen: Provide a "Xen PV" APIC driver to support >255 VCPUs

Instead of mangling the default APIC driver, provide a Xen PV guest
specific one that explicitly provides appropriate methods.

This allows use to report that all APIC IDs are valid, allowing dom0
to boot with more than 255 VCPUs.

Since the probe order of APIC drivers is link dependent, we add in an
late probe function to change to the Xen PV if it hadn't been done
during bootup.

Suggested-by: David Vrabel <david.vrabel@citrix.com>
Reported-by: Cathy Avery <cathy.avery@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/apic.c      | 180 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/enlighten.c |  90 +-----------------------
 2 files changed, 181 insertions(+), 89 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7005ced5d1ad..5e0ecaf4c336 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include "xen-ops.h"
+#include "smp.h"
 
 static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 {
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 	return 0xfd;
 }
 
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+	WARN_ON(1);
+	return x;
+}
+
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+	return ((x)>>24) & 0xFFu;
+}
+
+static u32 xen_apic_read(u32 reg)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_get_cpuinfo,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u.pcpu_info.xen_cpuid = 0,
+	};
+	int ret = 0;
+
+	/* Shouldn't need this as APIC is turned off for PV, and we only
+	 * get called on the bootup processor. But just in case. */
+	if (!xen_initial_domain() || smp_processor_id())
+		return 0;
+
+	if (reg == APIC_LVR)
+		return 0x10;
+#ifdef CONFIG_X86_32
+	if (reg == APIC_LDR)
+		return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+#endif
+	if (reg != APIC_ID)
+		return 0;
+
+	ret = HYPERVISOR_dom0_op(&op);
+	if (ret)
+		return 0;
+
+	return op.u.pcpu_info.apic_id << 24;
+}
+
+static void xen_apic_write(u32 reg, u32 val)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static u64 xen_apic_icr_read(void)
+{
+	return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+        return 0;
+}
+
+static int xen_apic_probe_pv(void)
+{
+	if (xen_pv_domain())
+		return 1;
+
+	return 0;
+}
+
+static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return xen_pv_domain();
+}
+
+static int xen_id_always_valid(int apicid)
+{
+	return 1;
+}
+
+static int xen_id_always_registered(void)
+{
+	return 1;
+}
+
+static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return initial_apic_id >> index_msb;
+}
+
+#ifdef CONFIG_X86_32
+static int xen_x86_32_early_logical_apicid(int cpu)
+{
+	/* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
+	return 1 << cpu;
+}
+#endif
+
+static void xen_noop(void)
+{
+}
+
+static void xen_silent_inquire(int apicid)
+{
+}
+
+static struct apic xen_pv_apic = {
+	.name 				= "Xen PV",
+	.probe 				= xen_apic_probe_pv,
+	.acpi_madt_oem_check		= xen_madt_oem_check,
+	.apic_id_valid 			= xen_id_always_valid,
+	.apic_id_registered 		= xen_id_always_registered,
+
+	/* .irq_delivery_mode - used in native_compose_msi_msg only */
+	/* .irq_dest_mode     - used in native_compose_msi_msg only */
+
+	.target_cpus			= default_target_cpus,
+	.disable_esr			= 0,
+	/* .dest_logical      -  default_send_IPI_ use it but we use our own. */
+	.check_apicid_used		= default_check_apicid_used, /* Used on 32-bit */
+
+	.vector_allocation_domain	= flat_vector_allocation_domain,
+	.init_apic_ldr			= xen_noop, /* setup_local_APIC calls it */
+
+	.ioapic_phys_id_map		= default_ioapic_phys_id_map, /* Used on 32-bit */
+	.setup_apic_routing		= NULL,
+	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
+	.apicid_to_cpu_present		= physid_set_mask_of_physid, /* Used on 32-bit */
+	.check_phys_apicid_present	= default_check_phys_apicid_present, /* smp_sanity_check needs it */
+	.phys_pkg_id			= xen_phys_pkg_id, /* detect_ht */
+
+	.get_apic_id 			= xen_get_apic_id,
+	.set_apic_id 			= xen_set_apic_id, /* Can be NULL on 32-bit. */
+	.apic_id_mask			= 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
+
+	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
+
+#ifdef CONFIG_SMP
+	.send_IPI_mask 			= xen_send_IPI_mask,
+	.send_IPI_mask_allbutself 	= xen_send_IPI_mask_allbutself,
+	.send_IPI_allbutself 		= xen_send_IPI_allbutself,
+	.send_IPI_all 			= xen_send_IPI_all,
+	.send_IPI_self 			= xen_send_IPI_self,
+#endif
+	/* .wait_for_init_deassert- used  by AP bootup - smp_callin which we don't use */
+	.inquire_remote_apic		= xen_silent_inquire,
+
+	.read				= xen_apic_read,
+	.write				= xen_apic_write,
+	.eoi_write			= xen_apic_write,
+
+	.icr_read 			= xen_apic_icr_read,
+	.icr_write 			= xen_apic_icr_write,
+	.wait_icr_idle 			= xen_noop,
+	.safe_wait_icr_idle 		= xen_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+	/* generic_processor_info and setup_local_APIC. */
+	.x86_32_early_logical_apicid	= xen_x86_32_early_logical_apicid,
+#endif
+};
+
+static void __init xen_apic_check(void)
+{
+	if (apic == &xen_pv_apic)
+		return;
+
+	pr_info("Switched APIC routing from %s to %s.\n", apic->name,
+		xen_pv_apic.name);
+	apic = &xen_pv_apic;
+}
 void __init xen_init_apic(void)
 {
 	x86_io_apic_ops.read = xen_io_apic_read;
+	/* On PV guests the APIC CPUID bit is disabled so none of the
+	 * routines end up executing. */
+	if (!xen_initial_domain())
+		apic = &xen_pv_apic;
+
+	x86_platform.apic_post_init = xen_apic_check;
 }
+apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..b9a227284149 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -927,92 +927,6 @@ static void xen_io_delay(void)
 {
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_set_apic_id(unsigned int x)
-{
-	WARN_ON(1);
-	return x;
-}
-static unsigned int xen_get_apic_id(unsigned long x)
-{
-	return ((x)>>24) & 0xFFu;
-}
-static u32 xen_apic_read(u32 reg)
-{
-	struct xen_platform_op op = {
-		.cmd = XENPF_get_cpuinfo,
-		.interface_version = XENPF_INTERFACE_VERSION,
-		.u.pcpu_info.xen_cpuid = 0,
-	};
-	int ret = 0;
-
-	/* Shouldn't need this as APIC is turned off for PV, and we only
-	 * get called on the bootup processor. But just in case. */
-	if (!xen_initial_domain() || smp_processor_id())
-		return 0;
-
-	if (reg == APIC_LVR)
-		return 0x10;
-
-	if (reg != APIC_ID)
-		return 0;
-
-	ret = HYPERVISOR_dom0_op(&op);
-	if (ret)
-		return 0;
-
-	return op.u.pcpu_info.apic_id << 24;
-}
-
-static void xen_apic_write(u32 reg, u32 val)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static u64 xen_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void xen_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void xen_apic_wait_icr_idle(void)
-{
-        return;
-}
-
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
-        return 0;
-}
-
-static void set_xen_basic_apic_ops(void)
-{
-	apic->read = xen_apic_read;
-	apic->write = xen_apic_write;
-	apic->icr_read = xen_apic_icr_read;
-	apic->icr_write = xen_apic_icr_write;
-	apic->wait_icr_idle = xen_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
-	apic->set_apic_id = xen_set_apic_id;
-	apic->get_apic_id = xen_get_apic_id;
-
-#ifdef CONFIG_SMP
-	apic->send_IPI_allbutself = xen_send_IPI_allbutself;
-	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
-	apic->send_IPI_mask = xen_send_IPI_mask;
-	apic->send_IPI_all = xen_send_IPI_all;
-	apic->send_IPI_self = xen_send_IPI_self;
-#endif
-}
-
-#endif
-
 static void xen_clts(void)
 {
 	struct multicall_space mcs;
@@ -1618,7 +1532,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/*
 	 * set up the basic apic ops.
 	 */
-	set_xen_basic_apic_ops();
+	xen_init_apic();
 #endif
 
 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1731,8 +1645,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 		if (HYPERVISOR_dom0_op(&op) == 0)
 			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
 
-		xen_init_apic();
-
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
 
-- 
cgit v1.2.3


From b3b06c7eb7820cea5c15f9faa4964044284c5399 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 3 Mar 2015 16:12:12 -0500
Subject: x86/xen/apic: WARN with details.

We should not be writting to the APIC registers under
Xen PV. But if we do instead of just giving an blanket
warning - include some details to help troubleshoot.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 5e0ecaf4c336..70e060ad879a 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -73,7 +73,7 @@ static u32 xen_apic_read(u32 reg)
 static void xen_apic_write(u32 reg, u32 val)
 {
 	/* Warn to see if there's any stray references */
-	WARN_ON(1);
+	WARN(1,"register: %x, value: %x\n", reg, val);
 }
 
 static u64 xen_apic_icr_read(void)
-- 
cgit v1.2.3


From 628c28eefd6f2cef03b212081b466ae43fd093a3 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Wed, 11 Mar 2015 14:49:56 +0000
Subject: xen: unify foreign GFN map/unmap for auto-xlated physmap guests

Auto-translated physmap guests (arm, arm64 and x86 PVHVM/PVH) map and
unmap foreign GFNs using the same method (updating the physmap).
Unify the two arm and x86 implementations into one commont one.

Note that on arm and arm64, the correct error code will be returned
(instead of always -EFAULT) and map/unmap failure warnings are no
longer printed.  These changes are required if the foreign domain is
paging (-ENOENT failures are expected and must be propagated up to the
caller).

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/x86/xen/mmu.c | 110 ++---------------------------------------------------
 1 file changed, 3 insertions(+), 107 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..3d536a56ddf1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2436,95 +2436,6 @@ void __init xen_hvm_init_mmu_ops(void)
 }
 #endif
 
-#ifdef CONFIG_XEN_PVH
-/*
- * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
- * space creating new guest on pvh dom0 and needing to map domU pages.
- */
-static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
-			    unsigned int domid)
-{
-	int rc, err = 0;
-	xen_pfn_t gpfn = lpfn;
-	xen_ulong_t idx = fgfn;
-
-	struct xen_add_to_physmap_range xatp = {
-		.domid = DOMID_SELF,
-		.foreign_domid = domid,
-		.size = 1,
-		.space = XENMAPSPACE_gmfn_foreign,
-	};
-	set_xen_guest_handle(xatp.idxs, &idx);
-	set_xen_guest_handle(xatp.gpfns, &gpfn);
-	set_xen_guest_handle(xatp.errs, &err);
-
-	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
-	if (rc < 0)
-		return rc;
-	return err;
-}
-
-static int xlate_remove_from_p2m(unsigned long spfn, int count)
-{
-	struct xen_remove_from_physmap xrp;
-	int i, rc;
-
-	for (i = 0; i < count; i++) {
-		xrp.domid = DOMID_SELF;
-		xrp.gpfn = spfn+i;
-		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
-		if (rc)
-			break;
-	}
-	return rc;
-}
-
-struct xlate_remap_data {
-	unsigned long fgfn; /* foreign domain's gfn */
-	pgprot_t prot;
-	domid_t  domid;
-	int index;
-	struct page **pages;
-};
-
-static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
-			    void *data)
-{
-	int rc;
-	struct xlate_remap_data *remap = data;
-	unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
-	pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
-
-	rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
-	if (rc)
-		return rc;
-	native_set_pte(ptep, pteval);
-
-	return 0;
-}
-
-static int xlate_remap_gfn_range(struct vm_area_struct *vma,
-				 unsigned long addr, unsigned long mfn,
-				 int nr, pgprot_t prot, unsigned domid,
-				 struct page **pages)
-{
-	int err;
-	struct xlate_remap_data pvhdata;
-
-	BUG_ON(!pages);
-
-	pvhdata.fgfn = mfn;
-	pvhdata.prot = prot;
-	pvhdata.domid = domid;
-	pvhdata.index = 0;
-	pvhdata.pages = pages;
-	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
-				  xlate_map_pte_fn, &pvhdata);
-	flush_tlb_all();
-	return err;
-}
-#endif
-
 #define REMAP_BATCH_SIZE 16
 
 struct remap_data {
@@ -2564,8 +2475,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
 		/* We need to update the local page tables and the xen HAP */
-		return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
-					     domid, pages);
+		return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+						 domid, pages);
 #else
 		return -EINVAL;
 #endif
@@ -2609,22 +2520,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 		return 0;
 
 #ifdef CONFIG_XEN_PVH
-	while (numpgs--) {
-		/*
-		 * The mmu has already cleaned up the process mmu
-		 * resources at this point (lookup_address will return
-		 * NULL).
-		 */
-		unsigned long pfn = page_to_pfn(pages[numpgs]);
-
-		xlate_remove_from_p2m(pfn, 1);
-	}
-	/*
-	 * We don't need to flush tlbs because as part of
-	 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
-	 * after removing the p2m entries from the EPT/NPT
-	 */
-	return 0;
+	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
 #else
 	return -EINVAL;
 #endif
-- 
cgit v1.2.3


From 4e8c0c8c4bf3a5b5c98046e146ab3884bf7a7d0e Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Wed, 11 Mar 2015 14:49:57 +0000
Subject: xen/privcmd: improve performance of MMAPBATCH_V2

Make the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version) map
multiple frames at a time rather than one at a time, despite the pages
being non-consecutive GFNs.

xen_remap_foreign_mfn_array() is added which maps an array of GFNs
(instead of a consecutive range of GFNs).

Since per-frame errors are returned in an array, privcmd must set the
MMAPBATCH_V1 error bits as part of the "report errors" phase, after
all the frames are mapped.

Migrate times are significantly improved (when using a PV toolstack
domain).  For example, for an idle 12 GiB PV guest:

        Before     After
  real  0m38.179s  0m26.868s
  user  0m15.096s  0m13.652s
  sys   0m28.988s  0m18.732s

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/x86/xen/mmu.c | 101 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 82 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3d536a56ddf1..29b3be230ede 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2439,7 +2439,8 @@ void __init xen_hvm_init_mmu_ops(void)
 #define REMAP_BATCH_SIZE 16
 
 struct remap_data {
-	unsigned long mfn;
+	xen_pfn_t *mfn;
+	bool contiguous;
 	pgprot_t prot;
 	struct mmu_update *mmu_update;
 };
@@ -2448,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 				 unsigned long addr, void *data)
 {
 	struct remap_data *rmd = data;
-	pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
+	pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
+
+	/* If we have a contigious range, just update the mfn itself,
+	   else update pointer to be "next mfn". */
+	if (rmd->contiguous)
+		(*rmd->mfn)++;
+	else
+		rmd->mfn++;
 
 	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
 	rmd->mmu_update->val = pte_val_ma(pte);
@@ -2457,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 	return 0;
 }
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
-			       unsigned long addr,
-			       xen_pfn_t mfn, int nr,
-			       pgprot_t prot, unsigned domid,
-			       struct page **pages)
-
+static int do_remap_mfn(struct vm_area_struct *vma,
+			unsigned long addr,
+			xen_pfn_t *mfn, int nr,
+			int *err_ptr, pgprot_t prot,
+			unsigned domid,
+			struct page **pages)
 {
+	int err = 0;
 	struct remap_data rmd;
 	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
-	int batch;
 	unsigned long range;
-	int err = 0;
+	int mapped = 0;
 
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
 		/* We need to update the local page tables and the xen HAP */
-		return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
-						 domid, pages);
+		return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+						 prot, domid, pages);
 #else
 		return -EINVAL;
 #endif
@@ -2484,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 
 	rmd.mfn = mfn;
 	rmd.prot = prot;
+	/* We use the err_ptr to indicate if there we are doing a contigious
+	 * mapping or a discontigious mapping. */
+	rmd.contiguous = !err_ptr;
 
 	while (nr) {
-		batch = min(REMAP_BATCH_SIZE, nr);
+		int index = 0;
+		int done = 0;
+		int batch = min(REMAP_BATCH_SIZE, nr);
+		int batch_left = batch;
 		range = (unsigned long)batch << PAGE_SHIFT;
 
 		rmd.mmu_update = mmu_update;
@@ -2495,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 		if (err)
 			goto out;
 
-		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
-		if (err < 0)
-			goto out;
+		/* We record the error for each page that gives an error, but
+		 * continue mapping until the whole set is done */
+		do {
+			int i;
+
+			err = HYPERVISOR_mmu_update(&mmu_update[index],
+						    batch_left, &done, domid);
+
+			/*
+			 * @err_ptr may be the same buffer as @mfn, so
+			 * only clear it after each chunk of @mfn is
+			 * used.
+			 */
+			if (err_ptr) {
+				for (i = index; i < index + done; i++)
+					err_ptr[i] = 0;
+			}
+			if (err < 0) {
+				if (!err_ptr)
+					goto out;
+				err_ptr[i] = err;
+				done++; /* Skip failed frame. */
+			} else
+				mapped += done;
+			batch_left -= done;
+			index += done;
+		} while (batch_left);
 
 		nr -= batch;
 		addr += range;
+		if (err_ptr)
+			err_ptr += batch;
 	}
-
-	err = 0;
 out:
 
 	xen_flush_tlb_all();
 
-	return err;
+	return err < 0 ? err : mapped;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       xen_pfn_t mfn, int nr,
+			       pgprot_t prot, unsigned domid,
+			       struct page **pages)
+{
+	return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       xen_pfn_t *mfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid, struct page **pages)
+{
+	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
+	 * and the consequences later is quite hard to detect what the actual
+	 * cause of "wrong memory was mapped in".
+	 */
+	BUG_ON(err_ptr == NULL);
+	return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
+
 /* Returns: 0 success */
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages)
-- 
cgit v1.2.3


From c80e5c0c23ce2282476fdc64c4b5e3d3a40723fd Mon Sep 17 00:00:00 2001
From: Eugene Shatokhin <eugene.shatokhin@rosalab.ru>
Date: Tue, 17 Mar 2015 19:09:18 +0900
Subject: kprobes/x86: Return correct length in __copy_instruction()

On x86-64, __copy_instruction() always returns 0 (error) if the
instruction uses %rip-relative addressing. This is because
kernel_insn_init() is called the second time for 'insn' instance
in such cases and sets all its fields to 0.

Because of this, trying to place a kprobe on such instruction
will fail, register_kprobe() will return -EINVAL.

This patch fixes the problem.

Signed-off-by: Eugene Shatokhin <eugene.shatokhin@rosalab.ru>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Link: http://lkml.kernel.org/r/20150317100918.28349.94654.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..03189d86357d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	int length;
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
 		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
+	length = insn.length;
+
 	/* Another subsystem puts a breakpoint, failed to recover */
 	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
-	memcpy(dest, insn.kaddr, insn.length);
+	memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
 	if (insn_rip_relative(&insn)) {
 		s64 newdisp;
 		u8 *disp;
-		kernel_insn_init(&insn, dest, insn.length);
+		kernel_insn_init(&insn, dest, length);
 		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
-	return insn.length;
+	return length;
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
-- 
cgit v1.2.3


From 431d452af13720463dda498999b2e9a08729c03a Mon Sep 17 00:00:00 2001
From: Zhonghui Fu <zhonghui.fu@linux.intel.com>
Date: Wed, 18 Mar 2015 15:54:27 +0100
Subject: PM / sleep: add pm-trace support for suspending phase

Occasionally, the system can't come back up after suspend/resume
due to problems of device suspending phase. This patch make
PM_TRACE infrastructure cover device suspending phase of
suspend/resume process, and the information in RTC can tell
developers which device suspending function make system hang.

Signed-off-by: Zhonghui Fu <zhonghui.fu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/pm-trace.h     | 23 +++++++++++++++++++++++
 arch/x86/include/asm/resume-trace.h | 21 ---------------------
 2 files changed, 23 insertions(+), 21 deletions(-)
 create mode 100644 arch/x86/include/asm/pm-trace.h
 delete mode 100644 arch/x86/include/asm/resume-trace.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h
new file mode 100644
index 000000000000..7b7ac42c3661
--- /dev/null
+++ b/arch/x86/include/asm/pm-trace.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_PM_TRACE_H
+#define _ASM_X86_PM_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user)					\
+do {								\
+	if (pm_trace_enabled) {					\
+		const void *tracedata;				\
+		asm volatile(_ASM_MOV " $1f,%0\n"		\
+			     ".section .tracedata,\"a\"\n"	\
+			     "1:\t.word %c1\n\t"		\
+			     _ASM_PTR " %c2\n"			\
+			     ".previous"			\
+			     :"=r" (tracedata)			\
+			     : "i" (__LINE__), "i" (__FILE__));	\
+		generate_pm_trace(tracedata, user);		\
+	}							\
+} while (0)
+
+#define TRACE_SUSPEND(user)	TRACE_RESUME(user)
+
+#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
deleted file mode 100644
index 3ff1c2cb1da5..000000000000
--- a/arch/x86/include/asm/resume-trace.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_X86_RESUME_TRACE_H
-#define _ASM_X86_RESUME_TRACE_H
-
-#include <asm/asm.h>
-
-#define TRACE_RESUME(user)					\
-do {								\
-	if (pm_trace_enabled) {					\
-		const void *tracedata;				\
-		asm volatile(_ASM_MOV " $1f,%0\n"		\
-			     ".section .tracedata,\"a\"\n"	\
-			     "1:\t.word %c1\n\t"		\
-			     _ASM_PTR " %c2\n"			\
-			     ".previous"			\
-			     :"=r" (tracedata)			\
-			     : "i" (__LINE__), "i" (__FILE__));	\
-		generate_resume_trace(tracedata, user);		\
-	}							\
-} while (0)
-
-#endif /* _ASM_X86_RESUME_TRACE_H */
-- 
cgit v1.2.3


From b97ea289cf6aff8d4cbcefe2b707bb9b00a73c73 Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Mon, 16 Mar 2015 11:18:56 +0800
Subject: PCI: Assign resources before drivers claim devices
 (pci_scan_root_bus())

Previously, pci_scan_root_bus() created a root PCI bus, enumerated the
devices on it, and called pci_bus_add_devices(), which made the devices
available for drivers to claim them.

Most callers assigned resources to devices after pci_scan_root_bus()
returns, which may be after drivers have claimed the devices.  This is
incorrect; the PCI core should not change device resources while a driver
is managing the device.

Remove pci_bus_add_devices() from pci_scan_root_bus() and do it after any
resource assignment in the callers.

Note that ARM's pci_common_init_dev() already called pci_bus_add_devices()
after pci_scan_root_bus(), so we only need to remove the first call:

  pci_common_init_dev
    pcibios_init_hw
      pci_scan_root_bus
        pci_bus_add_devices        # first call
    pci_bus_assign_resources
    pci_bus_add_devices            # second call

[bhelgaas: changelog, drop "root_bus" var in alpha common_init_pci(),
return failure earlier in mn10300, add "return" in x86 pcibios_scan_root(),
return early if xtensa platform_pcibios_fixup() fails]
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: Richard Henderson <rth@twiddle.net>
CC: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
CC: Matt Turner <mattst88@gmail.com>
CC: David Howells <dhowells@redhat.com>
CC: Tony Luck <tony.luck@intel.com>
CC: Michal Simek <monstr@monstr.eu>
CC: Ralf Baechle <ralf@linux-mips.org>
CC: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
CC: Sebastian Ott <sebott@linux.vnet.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Chris Metcalf <cmetcalf@ezchip.com>
CC: Chris Zankel <chris@zankel.net>
CC: Max Filippov <jcmvbkbc@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/pci/common.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 3d2612b68694..95a0ba70376b 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
 	if (!bus) {
 		pci_free_resource_list(&resources);
 		kfree(sd);
+		return;
 	}
+	pci_bus_add_devices(bus);
 }
 
 void __init pcibios_set_cache_line_size(void)
-- 
cgit v1.2.3


From 88ad1a147e2c84d33cb50f5ebff1ece5e0cd4383 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 20 Mar 2015 14:18:12 +1030
Subject: lguest: fix pending interrupt test.

Denys says:
  TEST with zero will always set ZF. Thus, "jnz send_interrupts" never jumps.

We get interrupts regularly enough that this didn't cause immediate problems.

Reported-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/head_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
index 6ddfe4fc23c3..05b0a85507ce 100644
--- a/arch/x86/lguest/head_32.S
+++ b/arch/x86/lguest/head_32.S
@@ -84,7 +84,7 @@ ENTRY(lg_irq_enable)
 	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
 	 * jump to send_interrupts, otherwise we're done.
 	 */
-	testl $0, lguest_data+LGUEST_DATA_irq_pending
+	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
 	jnz send_interrupts
 	/*
 	 * One cool thing about x86 is that you can do many things without using
-- 
cgit v1.2.3


From 4e16ed99416ef569a89782a7234f95007919fadd Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 26 Feb 2015 18:47:00 +0000
Subject: perf/x86/intel: Fix Makefile to actually build the cqm driver

Someone fat fingered a merge conflict and lost the Makefile hunk.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <acme@redhat.com>
Cc: <hpa@zytor.com>
Cc: <jolsa@redhat.com>
Cc: <kanaka.d.juvva@intel.com>
Cc: <tglx@linutronix.de>
Cc: <torvalds@linux-foundation.org>
Cc: <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1424976420.15321.35.camel@mfleming-mobl1.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..6c1ca139f736 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
-- 
cgit v1.2.3


From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2015 22:10:19 +0100
Subject: perf: Remove type specific target pointers

The only reason CQM had to use a hard-coded pmu type was so it could use
cqm_target in hw_perf_event.

Do away with the {tp,bp,cqm}_target pointers and provide a non type
specific one.

This allows us to do away with that silly pmu type as well.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Vince Weaver <vince@deater.net>
Cc: acme@kernel.org
Cc: acme@redhat.com
Cc: hpa@zytor.com
Cc: jolsa@redhat.com
Cc: kanaka.d.juvva@intel.com
Cc: matt.fleming@intel.com
Cc: tglx@linutronix.de
Cc: torvalds@linux-foundation.org
Cc: vikas.shivappa@linux.intel.com
Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 9a8ef8376fcd..e4d1b8b738fa 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 	/*
 	 * Events that target same task are placed into the same cache group.
 	 */
-	if (a->hw.cqm_target == b->hw.cqm_target)
+	if (a->hw.target == b->hw.target)
 		return true;
 
 	/*
@@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
 {
 	if (event->attach_state & PERF_ATTACH_TASK)
-		return perf_cgroup_from_task(event->hw.cqm_target);
+		return perf_cgroup_from_task(event->hw.target);
 
 	return event->cgrp;
 }
@@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
-				PERF_TYPE_INTEL_CQM);
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
-- 
cgit v1.2.3


From 41f055d49cb04d648a89a2cb6d57c94ff86d5bb6 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov <kuleshovmail@gmail.com>
Date: Tue, 24 Mar 2015 11:51:38 +1030
Subject: lguest: rename i386_head.S in the comments

i386_head.S renamed to the head_32.S, let's update it in
the comments too.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..543510a2f9e0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -262,7 +262,7 @@ PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
 PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
 /*:*/
 
-/* These are in i386_head.S */
+/* These are in head_32.S */
 extern void lg_irq_enable(void);
 extern void lg_restore_fl(unsigned long flags);
 
@@ -1366,7 +1366,7 @@ static void lguest_restart(char *reason)
  * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
- * and these are in i386_head.S.
+ * and these are in head_32.S.
  */
 
 /*G:060 We construct a table from the assembler templates: */
-- 
cgit v1.2.3


From 7042cb4eb30967b5eb9eeba04907882f04d6b6e5 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Tue, 24 Mar 2015 11:51:39 +1030
Subject: lguest: simplify lguest_iret

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: lguest@lists.ozlabs.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/head_32.S | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
index 05b0a85507ce..81678bf0fcb7 100644
--- a/arch/x86/lguest/head_32.S
+++ b/arch/x86/lguest/head_32.S
@@ -168,29 +168,28 @@ ENTRY(lg_restore_fl)
  * So we have to copy eflags from the stack to lguest_data.irq_enabled before
  * we do the "iret".
  *
- * There are two problems with this: firstly, we need to use a register to do
- * the copy and secondly, the whole thing needs to be atomic.  The first
- * problem is easy to solve: push %eax on the stack so we can use it, and then
- * restore it at the end just before the real "iret".
+ * There are two problems with this: firstly, we can't clobber any registers
+ * and secondly, the whole thing needs to be atomic.  The first problem
+ * is solved by using "push memory"/"pop memory" instruction pair for copying.
  *
  * The second is harder: copying eflags to lguest_data.irq_enabled will turn
  * interrupts on before we're finished, so we could be interrupted before we
  * return to userspace or wherever.  Our solution to this is to surround the
  * code with lguest_noirq_start: and lguest_noirq_end: labels.  We tell the
  * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled.
+ * enabled. (It's not necessary to protect pop instruction, since
+ * data gets updated only after it completes, so we end up surrounding
+ * just one instruction, iret).
  */
 ENTRY(lguest_iret)
-	pushl	%eax
-	movl	12(%esp), %eax
-lguest_noirq_start:
+	pushl	2*4(%esp)
 	/*
 	 * Note the %ss: segment prefix here.  Normal data accesses use the
 	 * "ds" segment, but that will have already been restored for whatever
 	 * we're returning to (such as userspace): we can't trust it.  The %ss:
 	 * prefix makes sure we use the stack segment, which is still valid.
 	 */
-	movl	%eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
-	popl	%eax
+	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
+lguest_noirq_start:
 	iret
 lguest_noirq_end:
-- 
cgit v1.2.3


From 2f921b5bb0511fb698681d8ef35c48be7a9116bf Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 24 Mar 2015 11:51:39 +1030
Subject: lguest: suppress interrupts for single insn, not range.

The last patch reduced our interrupt-suppression region to one address,
so simplify the code somewhat.

Also, remove the obsolete undefined instruction ranges and the comment
which refers to lguest_guest.S instead of head_32.S.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest.h |  7 ++-----
 arch/x86/lguest/boot.c        |  3 +--
 arch/x86/lguest/head_32.S     | 15 ++++++---------
 3 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index e2d4a4afa8c3..3bbc07a57a31 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -20,13 +20,10 @@ extern unsigned long switcher_addr;
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
 
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
+/* Declarations for definitions in arch/x86/lguest/head_32.S */
+extern char lguest_noirq_iret[];
 extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
 extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
 
 extern void lguest_iret(void);
 extern void lguest_init(void);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 543510a2f9e0..13616d708389 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,8 +87,7 @@
 
 struct lguest_data lguest_data = {
 	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-	.noirq_start = (u32)lguest_noirq_start,
-	.noirq_end = (u32)lguest_noirq_end,
+	.noirq_iret = (u32)lguest_noirq_iret,
 	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 	.syscall_vec = SYSCALL_VECTOR,
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
index 81678bf0fcb7..d5ae63f5ec5d 100644
--- a/arch/x86/lguest/head_32.S
+++ b/arch/x86/lguest/head_32.S
@@ -133,9 +133,8 @@ ENTRY(lg_restore_fl)
 	ret
 /*:*/
 
-/* These demark the EIP range where host should never deliver interrupts. */
-.global lguest_noirq_start
-.global lguest_noirq_end
+/* These demark the EIP where host should never deliver interrupts. */
+.global lguest_noirq_iret
 
 /*M:004
  * When the Host reflects a trap or injects an interrupt into the Guest, it
@@ -174,12 +173,11 @@ ENTRY(lg_restore_fl)
  *
  * The second is harder: copying eflags to lguest_data.irq_enabled will turn
  * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to surround the
- * code with lguest_noirq_start: and lguest_noirq_end: labels.  We tell the
+ * return to userspace or wherever.  Our solution to this is to tell the
  * Host that it is *never* to interrupt us there, even if interrupts seem to be
  * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we end up surrounding
- * just one instruction, iret).
+ * data gets updated only after it completes, so we only need to protect
+ * one instruction, iret).
  */
 ENTRY(lguest_iret)
 	pushl	2*4(%esp)
@@ -190,6 +188,5 @@ ENTRY(lguest_iret)
 	 * prefix makes sure we use the stack segment, which is still valid.
 	 */
 	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_start:
+lguest_noirq_iret:
 	iret
-lguest_noirq_end:
-- 
cgit v1.2.3


From 6e0a0ea12962a2175a9f47621f9fe7a4c866cb12 Mon Sep 17 00:00:00 2001
From: Graeme Gregory <graeme.gregory@linaro.org>
Date: Tue, 24 Mar 2015 14:02:39 +0000
Subject: ACPI / sleep: Introduce CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT

ACPI 5.1 does not currently support S states for ARM64 hardware but
ACPI code will call acpi_target_system_state() and acpi_sleep_init()
for device power management, so introduce
CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT and select it for x86 and
ia64 only to make sleep functions available, and also introduce stub
function to allow other drivers to function until S states are defined
for ARM64.

It will be no functional change for x86 and IA64.

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Graeme Gregory <graeme.gregory@linaro.org>
Signed-off-by: Tomasz Nowicki <tomasz.nowicki@linaro.org>
Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..c3ea9f9a29d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
 	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
 	select ARCH_HAS_FAST_MULTIPLIER
-- 
cgit v1.2.3


From 828aef376d7a129547bc4ebb949965040177e3da Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 24 Mar 2015 14:02:46 +0000
Subject: ACPI / processor: Introduce phys_cpuid_t for CPU hardware ID

CPU hardware ID (phys_id) is defined as u32 in structure acpi_processor,
but phys_id is used as int in acpi processor driver, so it will lead to
some inconsistence for the drivers.

Furthermore, to cater for ACPI arch ports that implement 64 bits CPU
ids a generic CPU physical id type is required.

So introduce typedef u32 phys_cpuid_t in a common file, and introduce
a macro PHYS_CPUID_INVALID as (phys_cpuid_t)(-1) if it's not defined
by other archs, this will solve the inconsistence in acpi processor driver,
and will prepare for the ACPI on ARM64 for the 64 bit CPU hardware ID
in the following patch.

CC: Rafael J Wysocki <rjw@rjwysocki.net>
Suggested-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Grant Likely <grant.likely@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
[hj: reworked cpu physid map return codes]
Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/x86/kernel/acpi/boot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3d525c6124f6..e4f8582eb756 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -757,7 +757,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 }
 
 /* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu)
+int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
 {
 	return _acpi_map_lsapic(handle, physid, pcpu);
 }
-- 
cgit v1.2.3


From 0f1b5ca240c65ed9533f193720f337bf24fb2f2f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 17 Feb 2015 18:18:04 -0800
Subject: perf/x86/intel: Add new cache events table for Haswell

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An uptodate list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is not in the event list anymore
has been also removed.

- data reads do not include code reads (comparable to earlier Sandy Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes both remote memory, remote cache, remote mmio.
- prefetches are not included in the counts for consistency
  (different from Sandy Bridge, which includes prefetches in the remote node)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Removed the HSM30 comments; we don't have them for SNB/IVB either. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 194 ++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9f1dd18fa395..5ef64bf88ecd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,196 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2520,8 +2710,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_hsw();
 
-- 
cgit v1.2.3


From 91f1b70582c62576f429cf78d53751c66677553d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 17 Feb 2015 18:18:05 -0800
Subject: perf/x86/intel: Add Broadwell core support

Add Broadwell support for Broadwell to perf.

The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical, table these are patched up in the initialization code.

The constraint list has one new event that needs to be handled over Haswell.

Includes code and testing from Kan Liang.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5ef64bf88ecd..28838536a9f7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -453,6 +462,12 @@ static __initconst const u64 snb_hw_cache_event_ids
 					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
 #define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
 
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
 static __initconst const u64 hsw_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2730,6 +2745,38 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
-- 
cgit v1.2.3


From 294fe0f52a44c6f207211de0686c369a961b5533 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 17 Feb 2015 18:18:06 -0800
Subject: perf/x86/intel: Add INST_RETIRED.ALL workarounds

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

This is erratum BDM11 and BDM55:

  http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.

Add a new callback to enforce this, and set it for Broadwell.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer (by Peterz):

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e0dab5ce61e9..ec6e982fd464 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27d6795..87e5081f4cdc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 28838536a9f7..fc6dbc46af4a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2096,6 +2096,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2774,6 +2800,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
cgit v1.2.3


From 9332d250b4b4f67c633894b311e022e3cf943bd5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Feb 2015 10:45:43 -0700
Subject: perf/x86: Remove redundant calls to perf_pmu_{dis|en}able()

perf_pmu_disable() is called before pmu->add() and perf_pmu_enable() is called
afterwards. No need to call these inside of x86_pmu_add() as well.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424281543-67335-1-git-send-email-dsahern@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ec6e982fd464..ac41b3ad1fc9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1044,7 +1044,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 
 	hwc = &event->hw;
 
-	perf_pmu_disable(event->pmu);
 	n0 = cpuc->n_events;
 	ret = n = collect_events(cpuc, event, false);
 	if (ret < 0)
@@ -1082,7 +1081,6 @@ done_collect:
 
 	ret = 0;
 out:
-	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Feb 2015 14:05:38 +0100
Subject: perf: Add per event clockid support

While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas, nobody should really care
in what time base that happens, its all relative information, as long
as its internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTNIC_RAW is required for correlating against a fixed rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ac41b3ad1fc9..0420ebcac116 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,
 
 	data = cyc2ns_read_begin();
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always in the local_clock domain.
+	 */
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
 	userpg->time_offset = data->cyc2ns_offset - now;
 
-	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = data->cyc2ns_offset;
+	/*
+	 * cap_user_time_zero doesn't make sense when we're using a different
+	 * time base for the records.
+	 */
+	if (event->clock == &local_clock) {
+		userpg->cap_user_time_zero = 1;
+		userpg->time_zero = data->cyc2ns_offset;
+	}
 
 	cyc2ns_read_end(data);
 }
-- 
cgit v1.2.3


From eabdc320ece583e16e581306c720e5f1ff67c3bb Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 21:58:17 +0200
Subject: crypto: aesni - mark AES-NI helper ciphers

Flag all AES-NI helper ciphers as internal ciphers to prevent them from
being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 6893f4947583..f9a78f32f494 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm)
 		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
 	struct crypto_aead *cryptd_child;
 	struct aesni_rfc4106_gcm_ctx *child_ctx;
-	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
+				       CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 
@@ -1262,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__aes-aesni",
 	.cra_driver_name	= "__driver-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1281,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__ecb-aes-aesni",
 	.cra_driver_name	= "__driver-ecb-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1301,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__cbc-aes-aesni",
 	.cra_driver_name	= "__driver-cbc-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1365,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__ctr-aes-aesni",
 	.cra_driver_name	= "__driver-ctr-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1409,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__gcm-aes-aesni",
 	.cra_driver_name	= "__driver-gcm-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
 				  AESNI_ALIGN,
@@ -1479,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__lrw-aes-aesni",
 	.cra_driver_name	= "__driver-lrw-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct aesni_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -1500,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__xts-aes-aesni",
 	.cra_driver_name	= "__driver-xts-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct aesni_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 6a9b52b7fa1a376c1749cf441d1aeb8572d3b691 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:01:49 +0200
Subject: crypto: clmulni - mark ghash clmulni helper ciphers

Flag all ash clmulni helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/ghash-clmulni-intel_glue.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
 		.cra_name		= "__ghash",
 		.cra_driver_name	= "__ghash-pclmulqdqni",
 		.cra_priority		= 0,
-		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH |
+					  CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= GHASH_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct ghash_ctx),
 		.cra_module		= THIS_MODULE,
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm)
 	struct cryptd_ahash *cryptd_tfm;
 	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
+					CRYPTO_ALG_INTERNAL,
+					CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 	ctx->cryptd_tfm = cryptd_tfm;
-- 
cgit v1.2.3


From a62356a978180eb3ccd2bcac49764dfd628c72f8 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:03:17 +0200
Subject: crypto: camellia_aesni_avx2 - mark AES-NI Camellia helper ciphers

Flag all AES-NI Camellia helper ciphers as internal ciphers to
prevent them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ecb-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-ecb-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__cbc-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-cbc-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ctr-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-ctr-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__lrw-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-lrw-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__xts-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-xts-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 680574e8b3b2f62e507cdd9ca35c77cad0344bb2 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:03:57 +0200
Subject: crypto: cast5_avx - mark CAST5 helper ciphers

Flag all CAST5 helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast5_avx_glue.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__ecb-cast5-avx",
 	.cra_driver_name	= "__driver-ecb-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST5_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__cbc-cast5-avx",
 	.cra_driver_name	= "__driver-cbc-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST5_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__ctr-cast5-avx",
 	.cra_driver_name	= "__driver-ctr-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 7d2c31dd70ed49210847ae8cc00f14064292b349 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:04:49 +0200
Subject: crypto: camellia_aesni_avx - mark AVX Camellia helper ciphers

Flag all AVX Camellia helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_aesni_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ecb-camellia-aesni",
 	.cra_driver_name	= "__driver-ecb-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__cbc-camellia-aesni",
 	.cra_driver_name	= "__driver-cbc-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ctr-camellia-aesni",
 	.cra_driver_name	= "__driver-ctr-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__lrw-camellia-aesni",
 	.cra_driver_name	= "__driver-lrw-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__xts-camellia-aesni",
 	.cra_driver_name	= "__driver-xts-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From e69b8a46ca0ec38ef419071b00574bc053664e18 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:05:35 +0200
Subject: crypto: cast6_avx - mark CAST6 helper ciphers

Flag all CAST6 helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast6_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__ecb-cast6-avx",
 	.cra_driver_name	= "__driver-ecb-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__cbc-cast6-avx",
 	.cra_driver_name	= "__driver-cbc-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__ctr-cast6-avx",
 	.cra_driver_name	= "__driver-ctr-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__lrw-cast6-avx",
 	.cra_driver_name	= "__driver-lrw-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__xts-cast6-avx",
 	.cra_driver_name	= "__driver-xts-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From f82419acd8ad6b045a040ba8f5f0289972189826 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:06:13 +0200
Subject: crypto: serpent_avx2 - mark Serpent AVX2 helper ciphers

Flag all Serpent AVX2 helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_avx2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__ecb-serpent-avx2",
 	.cra_driver_name	= "__driver-ecb-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__cbc-serpent-avx2",
 	.cra_driver_name	= "__driver-cbc-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__ctr-serpent-avx2",
 	.cra_driver_name	= "__driver-ctr-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__lrw-serpent-avx2",
 	.cra_driver_name	= "__driver-lrw-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__xts-serpent-avx2",
 	.cra_driver_name	= "__driver-xts-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 65aed5394120953ccb6e307d75f9abd17c15740e Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:07:05 +0200
Subject: crypto: serpent_avx - mark Serpent AVX helper ciphers

Flag all Serpent AVX helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ecb-serpent-avx",
 	.cra_driver_name	= "__driver-ecb-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__cbc-serpent-avx",
 	.cra_driver_name	= "__driver-cbc-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ctr-serpent-avx",
 	.cra_driver_name	= "__driver-ctr-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__lrw-serpent-avx",
 	.cra_driver_name	= "__driver-lrw-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__xts-serpent-avx",
 	.cra_driver_name	= "__driver-xts-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 748be1f1bfddfe83d4b31c17191b082e96c86867 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:07:45 +0200
Subject: crypto: serpent_sse2 - mark Serpent SSE2 helper ciphers

Flag all Serpent SSE2 helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index bf025adaea01..3643dd508f45 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ecb-serpent-sse2",
 	.cra_driver_name	= "__driver-ecb-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__cbc-serpent-sse2",
 	.cra_driver_name	= "__driver-cbc-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ctr-serpent-sse2",
 	.cra_driver_name	= "__driver-ctr-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__lrw-serpent-sse2",
 	.cra_driver_name	= "__driver-lrw-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__xts-serpent-sse2",
 	.cra_driver_name	= "__driver-xts-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 4dda66f62e01e832e09d6d1fbd9c9070d57b049e Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:08:53 +0200
Subject: crypto: twofish_avx - mark Twofish AVX helper ciphers

Flag all Twofish AVX helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 1ac531ea9bcc..b5e2d5651851 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__ecb-twofish-avx",
 	.cra_driver_name	= "__driver-ecb-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__cbc-twofish-avx",
 	.cra_driver_name	= "__driver-cbc-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__ctr-twofish-avx",
 	.cra_driver_name	= "__driver-ctr-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__lrw-twofish-avx",
 	.cra_driver_name	= "__driver-lrw-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__xts-twofish-avx",
 	.cra_driver_name	= "__driver-xts-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 555fa17b2b8cc865c203de263443041eb75bc7f7 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 30 Mar 2015 22:11:46 +0200
Subject: crypto: sha-mb - mark Multi buffer SHA1 helper cipher

Flag all Multi buffer SHA1 helper ciphers as internal ciphers
to prevent them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 9414fd964ecd..e510b1c5d690 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = {
 		 * use ASYNC flag as some buffers in multi-buffer
 		 * algo may not have completed before hashing thread sleep
 		 */
-		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC,
+		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
+				   CRYPTO_ALG_INTERNAL,
 		.cra_blocksize	 = SHA1_BLOCK_SIZE,
 		.cra_module	 = THIS_MODULE,
 		.cra_list	 = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
 	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct mcryptd_hash_ctx *mctx;
 
-	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0);
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+					  CRYPTO_ALG_INTERNAL,
+					  CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(mcryptd_tfm))
 		return PTR_ERR(mcryptd_tfm);
 	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
-- 
cgit v1.2.3


From ec776ef6bbe1734c29cd6bd05219cd93b2731bd4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Apr 2015 09:12:18 +0200
Subject: x86/mm: Add support for the non-standard protected e820 type

Various recent BIOSes support NVDIMMs or ADR using a
non-standard e820 memory type, and Intel supplied reference
Linux code using this type to various vendors.

Wire this e820 table type up to export platform devices for the
pmem driver so that we can use it in Linux.

Based on earlier work from:

   Dave Jiang <dave.jiang@intel.com>
   Dan Williams <dan.j.williams@intel.com>

Includes fixes for NUMA regions from Boaz Harrosh.

Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-nvdimm@ml01.01.org
Link: http://lkml.kernel.org/r/1427872339-6688-2-git-send-email-hch@lst.de
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                 | 10 ++++++++
 arch/x86/include/uapi/asm/e820.h | 10 ++++++++
 arch/x86/kernel/Makefile         |  1 +
 arch/x86/kernel/e820.c           | 26 +++++++++++++++-----
 arch/x86/kernel/pmem.c           | 53 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/pmem.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..9e3bcd6f4a48 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY
+	bool "Support non-standard NVDIMMs and ADR protected memory"
+	help
+	  Treat memory marked using the non-standard e820 type of 12 as used
+	  by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+	  The kernel will offer these regions to the 'pmem' driver so
+	  they can be used for persistent storage.
+
+	  Say Y if unsure.
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33f5236..960a8a9dc4ab 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot.  The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ *   but newer versions switched to 12 as 6 was assigned differently.  Some
+ *   time they will learn... )
+ */
+#define E820_PRAM	12
 
 /*
  * reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..971f18cd9ca0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..11cc7d54ec3f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PRAM:
+		printk(KERN_CONT "persistent (type %u)", type);
+		break;
 	default:
 		printk(KERN_CONT "type %u", type);
 		break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		 * continue building up new bios map based on this
 		 * information
 		 */
-		if (current_type != last_type)	{
+		if (current_type != last_type || current_type == E820_PRAM) {
 			if (last_type != 0)	 {
 				new_bios[new_bios_entry].size =
 					change_point[chgidx]->addr - last_addr;
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 			register_nosave_region(pfn, PFN_UP(ei->addr));
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
+
 		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
 			register_nosave_region(PFN_UP(ei->addr), pfn);
 
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		if (ei->type != type)
+		/*
+		 * Persistent memory is accounted as ram for purposes of
+		 * establishing max_pfn and mem_map.
+		 */
+		if (ei->type != E820_RAM && ei->type != E820_PRAM)
 			continue;
 
 		start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 }
 unsigned long __init e820_end_of_ram_pfn(void)
 {
-	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+	return e820_end_pfn(MAX_ARCH_PFN);
 }
 
 unsigned long __init e820_end_of_low_ram_pfn(void)
 {
-	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+	return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
 static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
 		e820_add_region(start_at, mem_size, E820_RESERVED);
+	} else if (*p == '!') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_PRAM);
 	} else
 		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
+	case E820_PRAM: return "Persistent RAM";
 	default:	return "reserved";
 	}
 }
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+		if (((e820.map[i].type != E820_RESERVED) &&
+		     (e820.map[i].type != E820_PRAM)) ||
+		     res->start < (1ULL<<20)) {
 			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..3420c874ddc5
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+	struct platform_device *pdev;
+	int error;
+
+	pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+	if (!pdev)
+		return;
+
+	error = platform_device_add_resources(pdev, res, 1);
+	if (error)
+		goto out_put_pdev;
+
+	error = platform_device_add(pdev);
+	if (error)
+		goto out_put_pdev;
+	return;
+
+out_put_pdev:
+	dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
+	platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->type == E820_PRAM) {
+			struct resource res = {
+				.flags	= IORESOURCE_MEM,
+				.start	= ei->addr,
+				.end	= ei->addr + ei->size - 1,
+			};
+			register_pmem_device(&res);
+		}
+	}
+
+	return 0;
+}
+device_initcall(register_pmem_devices);
-- 
cgit v1.2.3


From a436bb7b806383ae0593cab53d17fc9676270cd3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 27 Mar 2015 20:43:36 +0900
Subject: kbuild: use relative path more to include Makefile

Prior to this commit, it was impossible to use relative path to
include Makefiles from the top level Makefile because the option
"--include-dir=$(srctree)" becomes effective when Make enters into
sub Makefiles.

To use relative path in any places, this commit moves the option
above the "sub-make" target.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/x86/Makefile    | 2 +-
 arch/x86/Makefile.um | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ba2d9ce82dc..2fda005bb334 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y)
 				$(call cc-option,-fno-unit-at-a-time))
 
         # CPU-specific tuning. Anything which can be shared with UML should go here.
-        include $(srctree)/arch/x86/Makefile_32.cpu
+        include arch/x86/Makefile_32.cpu
         KBUILD_CFLAGS += $(cflags-y)
 
         # temporary until string.h is fixed
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA		:= -Ui386
 export LDS_EXTRA
 
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include $(srctree)/arch/x86/Makefile_32.cpu
+include arch/x86/Makefile_32.cpu
 
 # prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
 cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
-- 
cgit v1.2.3


From ed69628b3b04578179334393d7f5fe60a2681f1c Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:19 +0200
Subject: x86: Add Intel Processor Trace (INTEL_PT) cpu feature detection

Intel Processor Trace is an architecture extension that allows for program
flow tracing.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-11-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 arch/x86/kernel/cpu/scattered.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 361922dcc9b1..c1553b70fed4 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,7 @@
 #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
 #define X86_FEATURE_HWP_EPP	( 7*32+13) /* Intel HWP_EPP */
 #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+#define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_HWP_ACT_WINDOW,	CR_EAX, 9, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_EPP,		CR_EAX,10, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_PKG_REQ,	CR_EAX,11, 0x00000006, 0 },
+		{ X86_FEATURE_INTEL_PT,		CR_EBX,25, 0x00000007, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
 		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
-- 
cgit v1.2.3


From 4807034248bed58d49a4f9f450c024e3b0f58577 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:20 +0200
Subject: perf/x86: Mark Intel PT and LBR/BTS as mutually exclusive

Intel PT cannot be used at the same time as LBR or BTS and will cause a
general protection fault if they are used together. In order to avoid
fixing up GPs in the fast path, instead we disallow creating LBR/BTS
events when PT events are present and vice versa.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-12-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       | 43 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.h       | 40 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++++
 3 files changed, 94 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0420ebcac116..549d01d6d996 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
 	}
 }
 
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+	hw_perf_event_destroy(event);
+
+	/* undo the lbr/bts event accounting */
+	x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
 static inline int x86_pmu_initialized(void)
 {
 	return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+	int ret = -EBUSY, i;
+
+	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+		return 0;
+
+	mutex_lock(&pmc_reserve_mutex);
+	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+			goto out;
+
+	atomic_inc(&x86_pmu.lbr_exclusive[what]);
+	ret = 0;
+
+out:
+	mutex_unlock(&pmc_reserve_mutex);
+	return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+	atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
 int x86_setup_perfctr(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
 		/* BTS is currently only allowed for user-mode. */
 		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
+
+		/* disallow bts if conflicting events are present */
+		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+			return -EBUSY;
+
+		event->destroy = hw_perf_lbr_event_destroy;
 	}
 
 	hwc->config |= config;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 87e5081f4cdc..47499661e8d4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -408,6 +408,12 @@ union x86_pmu_config {
 
 #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
 
+enum {
+	x86_lbr_exclusive_lbr,
+	x86_lbr_exclusive_pt,
+	x86_lbr_exclusive_max,
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -505,6 +511,11 @@ struct x86_pmu {
 	const int	*lbr_sel_map;		   /* lbr_select mappings */
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 
+	/*
+	 * Intel PT/LBR/BTS are exclusive
+	 */
+	atomic_t	lbr_exclusive[x86_lbr_exclusive_max];
+
 	/*
 	 * Extra registers for events
 	 */
@@ -603,6 +614,12 @@ static inline int x86_pmu_rdpmc_index(int index)
 	return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
 }
 
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
 int x86_setup_perfctr(struct perf_event *event);
 
 int x86_pmu_hw_config(struct perf_event *event);
@@ -689,6 +706,29 @@ static inline int amd_pmu_init(void)
 
 #ifdef CONFIG_CPU_SUP_INTEL
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+	/* user explicitly requested branch sampling */
+	if (has_branch_stack(event))
+		return true;
+
+	/* implicit branch sampling to correct PEBS skid */
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+	    x86_pmu.intel_cap.pebs_format < 2)
+		return true;
+
+	return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+	    !event->attr.freq && event->hw.sample_period == 1)
+		return true;
+
+	return false;
+}
+
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index fc6dbc46af4a..b7b3ff21c832 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1942,6 +1942,17 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+
+		/*
+		 * BTS is set up earlier in this path, so don't account twice
+		 */
+		if (!intel_pmu_has_bts(event)) {
+			/* disallow lbr if conflicting events are present */
+			if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+				return -EBUSY;
+
+			event->destroy = hw_perf_lbr_event_destroy;
+		}
 	}
 
 	if (event->attr.type != PERF_TYPE_RAW)
-- 
cgit v1.2.3


From 52ca9ced3f70779589e6ecc329baffe69d8f5f7a Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 30 Jan 2015 12:39:52 +0200
Subject: perf/x86/intel/pt: Add Intel PT PMU driver

Add support for Intel Processor Trace (PT) to kernel's perf events.
PT is an extension of Intel Architecture that collects information about
software execuction such as control flow, execution modes and timings and
formats it into highly compressed binary packets. Even being compressed,
these packets are generated at hundreds of megabytes per second per core,
which makes it impractical to decode them on the fly in the kernel.

This driver exports trace data by through AUX space in the perf ring
buffer, which is zero-copy mapped into userspace for faster data retrieval.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422614392-114498-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/uapi/asm/msr-index.h     |   18 +
 arch/x86/kernel/cpu/Makefile              |    1 +
 arch/x86/kernel/cpu/intel_pt.h            |  131 ++++
 arch/x86/kernel/cpu/perf_event.h          |    2 +
 arch/x86/kernel/cpu/perf_event_intel.c    |    8 +
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 1096 +++++++++++++++++++++++++++++
 6 files changed, 1256 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/intel_pt.h
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_pt.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..1a4eae695ca8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -74,6 +74,24 @@
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
 
+#define MSR_IA32_RTIT_CTL		0x00000570
+#define RTIT_CTL_TRACEEN		BIT(0)
+#define RTIT_CTL_OS			BIT(2)
+#define RTIT_CTL_USR			BIT(3)
+#define RTIT_CTL_CR3EN			BIT(7)
+#define RTIT_CTL_TOPA			BIT(8)
+#define RTIT_CTL_TSC_EN			BIT(10)
+#define RTIT_CTL_DISRETC		BIT(11)
+#define RTIT_CTL_BRANCH_EN		BIT(13)
+#define MSR_IA32_RTIT_STATUS		0x00000571
+#define RTIT_STATUS_CONTEXTEN		BIT(1)
+#define RTIT_STATUS_TRIGGEREN		BIT(2)
+#define RTIT_STATUS_ERROR		BIT(4)
+#define RTIT_STATUS_STOPPED		BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH		0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE	0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK	0x00000561
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6c1ca139f736..e6b353f10e09 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,6 +40,7 @@ endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+	TOPA_4K	= 0,
+	TOPA_8K,
+	TOPA_16K,
+	TOPA_32K,
+	TOPA_64K,
+	TOPA_128K,
+	TOPA_256K,
+	TOPA_512K,
+	TOPA_1MB,
+	TOPA_2MB,
+	TOPA_4MB,
+	TOPA_8MB,
+	TOPA_16MB,
+	TOPA_32MB,
+	TOPA_64MB,
+	TOPA_128MB,
+	TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+	return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+	u64	end	: 1;
+	u64	rsvd0	: 1;
+	u64	intr	: 1;
+	u64	rsvd1	: 1;
+	u64	stop	: 1;
+	u64	rsvd2	: 1;
+	u64	size	: 4;
+	u64	rsvd3	: 2;
+	u64	base	: 36;
+	u64	rsvd4	: 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+	PT_CAP_max_subleaf = 0,
+	PT_CAP_cr3_filtering,
+	PT_CAP_topa_output,
+	PT_CAP_topa_multiple_entries,
+	PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+	struct pmu		pmu;
+	u32			caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *		cpu, depending on perf event configuration
+ * @cpu:	cpu for per-cpu allocation
+ * @tables:	list of ToPA tables in this buffer
+ * @first:	shorthand for first topa table
+ * @last:	shorthand for last topa table
+ * @cur:	current topa table
+ * @nr_pages:	buffer size in pages
+ * @cur_idx:	current output region's index within @cur table
+ * @output_off:	offset within the current output region
+ * @data_size:	running total of the amount of data in this buffer
+ * @lost:	if data was lost/truncated
+ * @head:	logical write offset inside the buffer
+ * @snapshot:	if this is for a snapshot/overwrite counter
+ * @stop_pos:	STOP topa entry in the buffer
+ * @intr_pos:	INT topa entry in the buffer
+ * @data_pages:	array of pages from perf
+ * @topa_index:	table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+	int			cpu;
+	struct list_head	tables;
+	struct topa		*first, *last, *cur;
+	unsigned int		cur_idx;
+	size_t			output_off;
+	unsigned long		nr_pages;
+	local_t			data_size;
+	local_t			lost;
+	local64_t		head;
+	bool			snapshot;
+	unsigned long		stop_pos, intr_pos;
+	void			**data_pages;
+	struct topa_entry	*topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:	perf output handle
+ * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+	struct perf_output_handle handle;
+	int			handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 47499661e8d4..f04729ac3290 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -808,6 +808,8 @@ void intel_pmu_lbr_init_hsw(void);
 
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
+void intel_pt_interrupt(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b7b3ff21c832..8eb22ce26303 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1589,6 +1589,14 @@ again:
 		x86_pmu.drain_pebs(regs);
 	}
 
+	/*
+	 * Intel PT
+	 */
+	if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+		handled++;
+		intel_pt_interrupt();
+	}
+
 	/*
 	 * Checkpointed counters can lead to 'spurious' PMIs because the
 	 * rollback caused by the PMI will have cleared the overflow status
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..a9a1092cf836
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1096 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+	CR_EAX = 0,
+	CR_ECX,
+	CR_EDX,
+	CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)						\
+	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
+			    .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+	const char	*name;
+	u32		leaf;
+	u8		reg;
+	u32		mask;
+} pt_caps[] = {
+	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
+	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
+	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
+	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
+	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+	struct pt_cap_desc *cd = &pt_caps[cap];
+	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+	unsigned int shift = __ffs(cd->mask);
+
+	return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	enum pt_capabilities cap = (long)ea->var;
+
+	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+	.name	= "caps",
+};
+
+PMU_FORMAT_ATTR(tsc,		"config:10"	);
+PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+
+static struct attribute *pt_formats_attr[] = {
+	&format_attr_tsc.attr,
+	&format_attr_noretcomp.attr,
+	NULL,
+};
+
+static struct attribute_group pt_format_group = {
+	.name	= "format",
+	.attrs	= pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+	&pt_cap_group,
+	&pt_format_group,
+	NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+	struct dev_ext_attribute *de_attrs;
+	struct attribute **attrs;
+	size_t size;
+	long i;
+
+	if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) {
+		for (i = 0; i < PT_CPUID_LEAVES; i++)
+			cpuid_count(20, i,
+				    &pt_pmu.caps[CR_EAX + i * 4],
+				    &pt_pmu.caps[CR_EBX + i * 4],
+				    &pt_pmu.caps[CR_ECX + i * 4],
+				    &pt_pmu.caps[CR_EDX + i * 4]);
+	} else {
+		return -ENODEV;
+	}
+
+	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1);
+	attrs = kzalloc(size, GFP_KERNEL);
+	if (!attrs)
+		goto err_attrs;
+
+	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1);
+	de_attrs = kzalloc(size, GFP_KERNEL);
+	if (!de_attrs)
+		goto err_de_attrs;
+
+	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+		de_attrs[i].attr.attr.name = pt_caps[i].name;
+
+		sysfs_attr_init(&de_attrs[i].attr.attr);
+		de_attrs[i].attr.attr.mode = S_IRUGO;
+		de_attrs[i].attr.show = pt_cap_show;
+		de_attrs[i].var = (void *)i;
+		attrs[i] = &de_attrs[i].attr.attr;
+	}
+
+	pt_cap_group.attrs = attrs;
+	return 0;
+
+err_de_attrs:
+	kfree(de_attrs);
+err_attrs:
+	kfree(attrs);
+
+	return -ENOMEM;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+	u64 config = event->attr.config;
+
+	if ((config & PT_CONFIG_MASK) != config)
+		return false;
+
+	return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+	u64 reg;
+
+	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+	if (!event->attr.exclude_kernel)
+		reg |= RTIT_CTL_OS;
+	if (!event->attr.exclude_user)
+		reg |= RTIT_CTL_USR;
+
+	reg |= (event->attr.config & PT_CONFIG_MASK);
+
+	wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+	if (start)
+		ctl |= RTIT_CTL_TRACEEN;
+	else
+		ctl &= ~RTIT_CTL_TRACEEN;
+	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	/*
+	 * A wrmsr that disables trace generation serializes other PT
+	 * registers and causes all data packets to be written to memory,
+	 * but a fence is required for the data to become globally visible.
+	 *
+	 * The below WMB, separating data store and aux_head store matches
+	 * the consumer's RMB that separates aux_head load and data load.
+	 */
+	if (!start)
+		wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+			     unsigned int output_off)
+{
+	u64 reg;
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:	actual ToPA table entries, as understood by PT hardware
+ * @list:	linkage to struct pt_buffer's list of tables
+ * @phys:	physical address of this page
+ * @offset:	offset of the first entry in this table in the buffer
+ * @size:	total size of all entries in this table
+ * @last:	index of the last initialized entry in this table
+ */
+struct topa {
+	struct topa_entry	table[TENTS_PER_PAGE];
+	struct list_head	list;
+	u64			phys;
+	u64			offset;
+	size_t			size;
+	int			last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:	CPU on which to allocate.
+ * @gfp:	Allocation flags.
+ *
+ * Return:	On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+	int node = cpu_to_node(cpu);
+	struct topa *topa;
+	struct page *p;
+
+	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+	if (!p)
+		return NULL;
+
+	topa = page_address(p);
+	topa->last = 0;
+	topa->phys = page_to_phys(p);
+
+	/*
+	 * In case of singe-entry ToPA, always put the self-referencing END
+	 * link as the 2nd entry in the table
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(topa, 1)->end = 1;
+	}
+
+	return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:	Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+	free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:	 PT buffer that's being extended.
+ * @topa:	 New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+	struct topa *last = buf->last;
+
+	list_add_tail(&topa->list, &buf->tables);
+
+	if (!buf->first) {
+		buf->first = buf->last = buf->cur = topa;
+		return;
+	}
+
+	topa->offset = last->offset + last->size;
+	buf->last = topa;
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return;
+
+	BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+	TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:	ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+	/* single-entry ToPA is a special case */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return !!topa->last;
+
+	return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:	PT buffer being initialized.
+ * @gfp:	Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:	0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+	struct topa *topa = buf->last;
+	int order = 0;
+	struct page *p;
+
+	p = virt_to_page(buf->data_pages[buf->nr_pages]);
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (topa_table_full(topa)) {
+		topa = topa_alloc(buf->cpu, gfp);
+		if (!topa)
+			return -ENOMEM;
+
+		topa_insert_table(buf, topa);
+	}
+
+	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+	TOPA_ENTRY(topa, -1)->size = order;
+	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, -1)->intr = 1;
+		TOPA_ENTRY(topa, -1)->stop = 1;
+	}
+
+	topa->last++;
+	topa->size += sizes(order);
+
+	buf->nr_pages += 1ul << order;
+
+	return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:	PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+	struct topa *topa;
+
+	list_for_each_entry(topa, &buf->tables, list) {
+		int i;
+
+		pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table,
+			 (void *)topa->phys, topa->offset, topa->size);
+		for (i = 0; i < TENTS_PER_PAGE; i++) {
+			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+				 &topa->table[i],
+				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
+				 sizes(topa->table[i].size),
+				 topa->table[i].end ?  'E' : ' ',
+				 topa->table[i].intr ? 'I' : ' ',
+				 topa->table[i].stop ? 'S' : ' ',
+				 *(u64 *)&topa->table[i]);
+			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+			     topa->table[i].stop) ||
+			    topa->table[i].end)
+				break;
+		}
+	}
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:	PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+	buf->output_off = 0;
+	buf->cur_idx++;
+
+	if (buf->cur_idx == buf->cur->last) {
+		if (buf->cur == buf->last)
+			buf->cur = buf->first;
+		else
+			buf->cur = list_entry(buf->cur->list.next, struct topa,
+					      list);
+		buf->cur_idx = 0;
+	}
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:		Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	u64 topa_idx, base, old;
+
+	/* offset of the first region in this table from the beginning of buf */
+	base = buf->cur->offset + buf->output_off;
+
+	/* offset of the current output region within this table */
+	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+		base += sizes(buf->cur->table[topa_idx].size);
+
+	if (buf->snapshot) {
+		local_set(&buf->data_size, base);
+	} else {
+		old = (local64_xchg(&buf->head, base) &
+		       ((buf->nr_pages << PAGE_SHIFT) - 1));
+		if (base < old)
+			base += buf->nr_pages << PAGE_SHIFT;
+
+		local_add(base - old, &buf->data_size);
+	}
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:	PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:	PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+	return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:		Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	int advance = 0;
+	u64 status;
+
+	rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+	if (status & RTIT_STATUS_ERROR) {
+		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+		pt_topa_dump(buf);
+		status &= ~RTIT_STATUS_ERROR;
+	}
+
+	if (status & RTIT_STATUS_STOPPED) {
+		status &= ~RTIT_STATUS_STOPPED;
+
+		/*
+		 * On systems that only do single-entry ToPA, hitting STOP
+		 * means we are already losing data; need to let the decoder
+		 * know.
+		 */
+		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+			local_inc(&buf->lost);
+			advance++;
+		}
+	}
+
+	/*
+	 * Also on single-entry ToPA implementations, interrupt will come
+	 * before the output reaches its output region's boundary.
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+		void *head = pt_buffer_region(buf);
+
+		/* everything within this margin needs to be zeroed out */
+		memset(head + buf->output_off, 0,
+		       pt_buffer_region_size(buf) -
+		       buf->output_off);
+		advance++;
+	}
+
+	if (advance)
+		pt_buffer_advance(buf);
+
+	wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:	PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+	u64 offset, base_topa;
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+	buf->cur = phys_to_virt(base_topa);
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+	/* offset within current output region */
+	buf->output_off = offset >> 32;
+	/* index of current output region within this table */
+	buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:	PT buffer.
+ * @pg:		Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+	struct topa_entry *te = buf->topa_index[pg];
+
+	/* one region */
+	if (buf->first == buf->last && buf->first->last == 1)
+		return pg;
+
+	do {
+		pg++;
+		pg &= buf->nr_pages - 1;
+	} while (buf->topa_index[pg] == te);
+
+	return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:	PT buffer.
+ * @handle:	Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+				   struct perf_output_handle *handle)
+
+{
+	unsigned long idx, npages, end;
+
+	if (buf->snapshot)
+		return 0;
+
+	/* can't stop in the middle of an output region */
+	if (buf->output_off + handle->size + 1 <
+	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+		return -EINVAL;
+
+
+	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return 0;
+
+	/* clear STOP and INT from current entry */
+	buf->topa_index[buf->stop_pos]->stop = 0;
+	buf->topa_index[buf->intr_pos]->intr = 0;
+
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		npages = (handle->size + 1) >> PAGE_SHIFT;
+		end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+		/*if (end > handle->wakeup >> PAGE_SHIFT)
+		  end = handle->wakeup >> PAGE_SHIFT;*/
+		idx = end & (buf->nr_pages - 1);
+		buf->stop_pos = idx;
+		idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+		idx &= buf->nr_pages - 1;
+		buf->intr_pos = idx;
+	}
+
+	buf->topa_index[buf->stop_pos]->stop = 1;
+	buf->topa_index[buf->intr_pos]->intr = 1;
+
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:	PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+	struct topa *cur = buf->first, *prev = buf->last;
+	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
+	int pg = 0, idx = 0, ntopa = 0;
+
+	while (pg < buf->nr_pages) {
+		int tidx;
+
+		/* pages within one topa entry */
+		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+			buf->topa_index[pg] = te_prev;
+
+		te_prev = te_cur;
+
+		if (idx == cur->last - 1) {
+			/* advance to next topa table */
+			idx = 0;
+			cur = list_entry(cur->list.next, struct topa, list);
+			ntopa++;
+		} else
+			idx++;
+		te_cur = TOPA_ENTRY(cur, idx);
+	}
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:	PT buffer.
+ * @head:	Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+	int pg;
+
+	if (buf->snapshot)
+		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+	pg = pt_topa_next_entry(buf, pg);
+
+	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+			(unsigned long)buf->cur) / sizeof(struct topa_entry);
+	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+	local64_set(&buf->head, head);
+	local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:	PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+	struct topa *topa, *iter;
+
+	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+		/*
+		 * right now, this is in free_aux() path only, so
+		 * no need to unlink this table from the list
+		 */
+		topa_free(topa);
+	}
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:	PT buffer.
+ * @size:	Total size of all regions within this ToPA.
+ * @gfp:	Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+			       gfp_t gfp)
+{
+	struct topa *topa;
+	int err;
+
+	topa = topa_alloc(buf->cpu, gfp);
+	if (!topa)
+		return -ENOMEM;
+
+	topa_insert_table(buf, topa);
+
+	while (buf->nr_pages < nr_pages) {
+		err = topa_insert_pages(buf, gfp);
+		if (err) {
+			pt_buffer_fini_topa(buf);
+			return -ENOMEM;
+		}
+	}
+
+	pt_buffer_setup_topa_index(buf);
+
+	/* link last table to the first one, unless we're double buffering */
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(buf->last, -1)->end = 1;
+	}
+
+	pt_topa_dump(buf);
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:	Cpu on which to allocate, -1 means current.
+ * @pages:	Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:	Number of pages in the buffer.
+ * @snapshot:	If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:	Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+	struct pt_buffer *buf;
+	int node, ret;
+
+	if (!nr_pages)
+		return NULL;
+
+	if (cpu == -1)
+		cpu = raw_smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+			   GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->cpu = cpu;
+	buf->snapshot = snapshot;
+	buf->data_pages = pages;
+
+	INIT_LIST_HEAD(&buf->tables);
+
+	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+	if (ret) {
+		kfree(buf);
+		return NULL;
+	}
+
+	return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:	PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+	struct pt_buffer *buf = data;
+
+	pt_buffer_fini_topa(buf);
+	kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:	PT buffer.
+ * @pt:		Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= pt->handle.size)
+		return true;
+
+	return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+	struct perf_event *event = pt->handle.event;
+
+	/*
+	 * There may be a dangling PT bit in the interrupt status register
+	 * after PT has been disabled by pt_event_stop(). Make sure we don't
+	 * do anything (particularly, re-enable) for this event here.
+	 */
+	if (!ACCESS_ONCE(pt->handle_nmi))
+		return;
+
+	pt_config_start(false);
+
+	if (!event)
+		return;
+
+	buf = perf_get_aux(&pt->handle);
+	if (!buf)
+		return;
+
+	pt_read_offset(buf);
+
+	pt_handle_status(pt);
+
+	pt_update_head(pt);
+
+	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+			    local_xchg(&buf->lost, 0));
+
+	if (!event->hw.state) {
+		int ret;
+
+		buf = perf_aux_output_begin(&pt->handle, event);
+		if (!buf) {
+			event->hw.state = PERF_HES_STOPPED;
+			return;
+		}
+
+		pt_buffer_reset_offsets(buf, pt->handle.head);
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			return;
+		}
+
+		pt_config_buffer(buf->cur->table, buf->cur_idx,
+				 buf->output_off);
+		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+		pt_config(event);
+	}
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+		event->hw.state = PERF_HES_STOPPED;
+		return;
+	}
+
+	ACCESS_ONCE(pt->handle_nmi) = 1;
+	event->hw.state = 0;
+
+	pt_config_buffer(buf->cur->table, buf->cur_idx,
+			 buf->output_off);
+	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+	pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+	/*
+	 * Protect against the PMI racing with disabling wrmsr,
+	 * see comment in intel_pt_interrupt().
+	 */
+	ACCESS_ONCE(pt->handle_nmi) = 0;
+	pt_config_start(false);
+
+	if (event->hw.state == PERF_HES_STOPPED)
+		return;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_UPDATE) {
+		struct pt *pt = this_cpu_ptr(&pt_ctx);
+		struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+		if (!buf)
+			return;
+
+		if (WARN_ON_ONCE(pt->handle.event != event))
+			return;
+
+		pt_read_offset(buf);
+
+		pt_handle_status(pt);
+
+		pt_update_head(pt);
+	}
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+
+	pt_event_stop(event, PERF_EF_UPDATE);
+
+	buf = perf_get_aux(&pt->handle);
+
+	if (buf) {
+		if (buf->snapshot)
+			pt->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+				    local_xchg(&buf->lost, 0));
+	}
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+	struct pt_buffer *buf;
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	if (pt->handle.event)
+		goto out;
+
+	buf = perf_aux_output_begin(&pt->handle, event);
+	if (!buf) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pt_buffer_reset_offsets(buf, pt->handle.head);
+	if (!buf->snapshot) {
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			goto out;
+		}
+	}
+
+	if (mode & PERF_EF_START) {
+		pt_event_start(event, 0);
+		if (hwc->state == PERF_HES_STOPPED) {
+			pt_event_del(event, 0);
+			ret = -EBUSY;
+		}
+	} else {
+		hwc->state = PERF_HES_STOPPED;
+	}
+
+	ret = 0;
+out:
+
+	if (ret)
+		hwc->state = PERF_HES_STOPPED;
+
+	return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+	if (event->attr.type != pt_pmu.pmu.type)
+		return -ENOENT;
+
+	if (!pt_event_valid(event))
+		return -EINVAL;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_pt))
+		return -EBUSY;
+
+	event->destroy = pt_event_destroy;
+
+	return 0;
+}
+
+static __init int pt_init(void)
+{
+	int ret, cpu, prior_warn = 0;
+
+	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		u64 ctl;
+
+		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+		if (!ret && (ctl & RTIT_CTL_TRACEEN))
+			prior_warn++;
+	}
+	put_online_cpus();
+
+	if (prior_warn) {
+		x86_add_exclusive(x86_lbr_exclusive_pt);
+		pr_warn("PT is enabled at boot time, doing nothing\n");
+
+		return -EBUSY;
+	}
+
+	ret = pt_pmu_hw_init();
+	if (ret)
+		return ret;
+
+	if (!pt_cap_get(PT_CAP_topa_output)) {
+		pr_warn("ToPA output is not supported on this CPU\n");
+		return -ENODEV;
+	}
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		pt_pmu.pmu.capabilities =
+			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+	pt_pmu.pmu.attr_groups	= pt_attr_groups;
+	pt_pmu.pmu.task_ctx_nr	= perf_sw_context;
+	pt_pmu.pmu.event_init	= pt_event_init;
+	pt_pmu.pmu.add		= pt_event_add;
+	pt_pmu.pmu.del		= pt_event_del;
+	pt_pmu.pmu.start	= pt_event_start;
+	pt_pmu.pmu.stop		= pt_event_stop;
+	pt_pmu.pmu.read		= pt_event_read;
+	pt_pmu.pmu.setup_aux	= pt_buffer_setup_aux;
+	pt_pmu.pmu.free_aux	= pt_buffer_free_aux;
+	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+	return ret;
+}
+
+module_init(pt_init);
-- 
cgit v1.2.3


From 8062382c8dbe2dc11d37e7f0b139508cf10de9d4 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 30 Jan 2015 12:40:35 +0200
Subject: perf/x86/intel/bts: Add BTS PMU driver

Add support for Branch Trace Store (BTS) via kernel perf event infrastructure.
The difference with the existing implementation of BTS support is that this
one is a separate PMU that exports events' trace buffers to userspace by means
of AUX area of the perf buffer, which is zero-copy mapped into userspace.

The immediate benefit is that the buffer size can be much bigger, resulting in
fewer interrupts and no kernel side copying is involved and little to no trace
data loss. Also, kernel code can be traced with this driver.

The old way of collecting BTS traces still works.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422614435-114702-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile               |   2 +-
 arch/x86/kernel/cpu/perf_event.h           |   7 +
 arch/x86/kernel/cpu/perf_event_intel.c     |   6 +-
 arch/x86/kernel/cpu/perf_event_intel_bts.c | 525 +++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  |   3 +-
 5 files changed, 540 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_bts.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e6b353f10e09..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,7 +40,7 @@ endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o perf_event_intel_bts.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f04729ac3290..eaebfd707016 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -410,6 +410,7 @@ union x86_pmu_config {
 
 enum {
 	x86_lbr_exclusive_lbr,
+	x86_lbr_exclusive_bts,
 	x86_lbr_exclusive_pt,
 	x86_lbr_exclusive_max,
 };
@@ -810,6 +811,12 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
 
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8eb22ce26303..b9861e19cd3d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1242,6 +1242,8 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+	else
+		intel_bts_disable_local();
 
 	intel_pmu_pebs_disable_all();
 	intel_pmu_lbr_disable_all();
@@ -1264,7 +1266,8 @@ static void intel_pmu_enable_all(int added)
 			return;
 
 		intel_pmu_enable_bts(event->hw.config);
-	}
+	} else
+		intel_bts_enable_local();
 }
 
 /*
@@ -1550,6 +1553,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
+	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
 	if (!status)
 		goto done;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..fb1a4c28f3e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+	struct perf_output_handle	handle;
+	struct debug_store		ds_back;
+	int				started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE		24
+#define BTS_SAFETY_MARGIN	4080
+
+struct bts_phys {
+	struct page	*page;
+	unsigned long	size;
+	unsigned long	offset;
+	unsigned long	displacement;
+};
+
+struct bts_buffer {
+	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
+	unsigned int	nr_pages;
+	unsigned int	nr_bufs;
+	unsigned int	cur_buf;
+	bool		snapshot;
+	local_t		data_size;
+	local_t		lost;
+	local_t		head;
+	unsigned long	end;
+	void		**data_pages;
+	struct bts_phys	buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+	return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+	struct bts_buffer *buf;
+	struct page *page;
+	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	unsigned long offset;
+	size_t size = nr_pages << PAGE_SHIFT;
+	int pg, nbuf, pad;
+
+	/* count all the high order buffers */
+	for (pg = 0, nbuf = 0; pg < nr_pages;) {
+		page = virt_to_page(pages[pg]);
+		if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+			return NULL;
+		pg += 1 << page_private(page);
+		nbuf++;
+	}
+
+	/*
+	 * to avoid interrupts in overwrite mode, only allow one physical
+	 */
+	if (overwrite && nbuf > 1)
+		return NULL;
+
+	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->nr_pages = nr_pages;
+	buf->nr_bufs = nbuf;
+	buf->snapshot = overwrite;
+	buf->data_pages = pages;
+	buf->real_size = size - size % BTS_RECORD_SIZE;
+
+	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+		unsigned int __nr_pages;
+
+		page = virt_to_page(pages[pg]);
+		__nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+		buf->buf[nbuf].page = page;
+		buf->buf[nbuf].offset = offset;
+		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+		buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+		buf->buf[nbuf].size -= pad;
+
+		pg += __nr_pages;
+		offset += __nr_pages << PAGE_SHIFT;
+	}
+
+	return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+	kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+	return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_phys *phys = &buf->buf[buf->cur_buf];
+	unsigned long index, thresh = 0, end = phys->size;
+	struct page *page = phys->page;
+
+	index = local_read(&buf->head);
+
+	if (!buf->snapshot) {
+		if (buf->end < phys->offset + buf_size(page))
+			end = buf->end - phys->offset - phys->displacement;
+
+		index -= phys->offset + phys->displacement;
+
+		if (end - index > BTS_SAFETY_MARGIN)
+			thresh = end - BTS_SAFETY_MARGIN;
+		else if (end - index > BTS_RECORD_SIZE)
+			thresh = end - BTS_RECORD_SIZE;
+		else
+			thresh = end;
+	}
+
+	ds->bts_buffer_base = (u64)page_address(page) + phys->displacement;
+	ds->bts_index = ds->bts_buffer_base + index;
+	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+	ds->bts_interrupt_threshold = !buf->snapshot
+		? ds->bts_buffer_base + thresh
+		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+	unsigned long index = head - phys->offset;
+
+	memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= bts->handle.size ||
+	    bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+		return true;
+
+	return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+	if (!buf)
+		return;
+
+	head = index + bts_buffer_offset(buf, buf->cur_buf);
+	old = local_xchg(&buf->head, head);
+
+	if (!buf->snapshot) {
+		if (old == head)
+			return;
+
+		if (ds->bts_index >= ds->bts_absolute_maximum)
+			local_inc(&buf->lost);
+
+		/*
+		 * old and head are always in the same physical buffer, so we
+		 * can subtract them to get the data size.
+		 */
+		local_add(head - old, &buf->data_size);
+	} else {
+		local_set(&buf->data_size, head);
+	}
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	u64 config = 0;
+
+	if (!buf || bts_buffer_is_full(buf, bts))
+		return;
+
+	event->hw.state = 0;
+
+	if (!buf->snapshot)
+		config |= ARCH_PERFMON_EVENTSEL_INT;
+	if (!event->attr.exclude_kernel)
+		config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		config |= ARCH_PERFMON_EVENTSEL_USR;
+
+	bts_config_buffer(buf);
+
+	/*
+	 * local barrier to make sure that ds configuration made it
+	 * before we enable BTS
+	 */
+	wmb();
+
+	intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	__bts_event_start(event);
+
+	/* PMI handler: this counter is running and likely generating PMIs */
+	ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+	/*
+	 * No extra synchronization is mandated by the documentation to have
+	 * BTS data stores globally visible.
+	 */
+	intel_pmu_disable_bts();
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	/* PMI handler: don't restart this counter */
+	ACCESS_ONCE(bts->started) = 0;
+
+	__bts_event_stop(event);
+
+	if (flags & PERF_EF_UPDATE)
+		bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event && bts->started)
+		__bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event)
+		__bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+	unsigned long head, space, next_space, pad, gap, skip, wakeup;
+	unsigned int next_buf;
+	struct bts_phys *phys, *next_phys;
+	int ret;
+
+	if (buf->snapshot)
+		return 0;
+
+	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+	if (WARN_ON_ONCE(head != local_read(&buf->head)))
+		return -EINVAL;
+
+	phys = &buf->buf[buf->cur_buf];
+	space = phys->offset + phys->displacement + phys->size - head;
+	pad = space;
+	if (space > handle->size) {
+		space = handle->size;
+		space -= space % BTS_RECORD_SIZE;
+	}
+	if (space <= BTS_SAFETY_MARGIN) {
+		/* See if next phys buffer has more space */
+		next_buf = buf->cur_buf + 1;
+		if (next_buf >= buf->nr_bufs)
+			next_buf = 0;
+		next_phys = &buf->buf[next_buf];
+		gap = buf_size(phys->page) - phys->displacement - phys->size +
+		      next_phys->displacement;
+		skip = pad + gap;
+		if (handle->size >= skip) {
+			next_space = next_phys->size;
+			if (next_space + skip > handle->size) {
+				next_space = handle->size - skip;
+				next_space -= next_space % BTS_RECORD_SIZE;
+			}
+			if (next_space > space || !space) {
+				if (pad)
+					bts_buffer_pad_out(phys, head);
+				ret = perf_aux_output_skip(handle, skip);
+				if (ret)
+					return ret;
+				/* Advance to next phys buffer */
+				phys = next_phys;
+				space = next_space;
+				head = phys->offset + phys->displacement;
+				/*
+				 * After this, cur_buf and head won't match ds
+				 * anymore, so we must not be racing with
+				 * bts_update().
+				 */
+				buf->cur_buf = next_buf;
+				local_set(&buf->head, head);
+			}
+		}
+	}
+
+	/* Don't go far beyond wakeup watermark */
+	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+		 handle->head;
+	if (space > wakeup) {
+		space = wakeup;
+		space -= space % BTS_RECORD_SIZE;
+	}
+
+	buf->end = head + space;
+
+	/*
+	 * If we have no space, the lost notification would have been sent when
+	 * we hit absolute_maximum - see bts_update()
+	 */
+	if (!space)
+		return -ENOSPC;
+
+	return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct perf_event *event = bts->handle.event;
+	struct bts_buffer *buf;
+	s64 old_head;
+	int err;
+
+	if (!event || !bts->started)
+		return 0;
+
+	buf = perf_get_aux(&bts->handle);
+	/*
+	 * Skip snapshot counters: they don't use the interrupt, but
+	 * there's no other way of telling, because the pointer will
+	 * keep moving
+	 */
+	if (!buf || buf->snapshot)
+		return 0;
+
+	old_head = local_read(&buf->head);
+	bts_update(bts);
+
+	/* no new data */
+	if (old_head == local_read(&buf->head))
+		return 0;
+
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+			    !!local_xchg(&buf->lost, 0));
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return 1;
+
+	err = bts_buffer_reset(buf, &bts->handle);
+	if (err)
+		perf_aux_output_end(&bts->handle, 0, false);
+
+	return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+	bts_event_stop(event, PERF_EF_UPDATE);
+
+	if (buf) {
+		if (buf->snapshot)
+			bts->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+				    !!local_xchg(&buf->lost, 0));
+	}
+
+	cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+	cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+	struct bts_buffer *buf;
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		return -EBUSY;
+
+	if (bts->handle.event)
+		return -EBUSY;
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return -EINVAL;
+
+	ret = bts_buffer_reset(buf, &bts->handle);
+	if (ret) {
+		perf_aux_output_end(&bts->handle, 0, false);
+		return ret;
+	}
+
+	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+	if (mode & PERF_EF_START) {
+		bts_event_start(event, 0);
+		if (hwc->state & PERF_HES_STOPPED) {
+			bts_event_del(event, 0);
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+	if (event->attr.type != bts_pmu.type)
+		return -ENOENT;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_bts))
+		return -EBUSY;
+
+	event->destroy = bts_event_destroy;
+
+	return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+		return -ENODEV;
+
+	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+	bts_pmu.task_ctx_nr	= perf_sw_context;
+	bts_pmu.event_init	= bts_event_init;
+	bts_pmu.add		= bts_event_add;
+	bts_pmu.del		= bts_event_del;
+	bts_pmu.start		= bts_event_start;
+	bts_pmu.stop		= bts_event_stop;
+	bts_pmu.read		= bts_event_read;
+	bts_pmu.setup_aux	= bts_buffer_setup_aux;
+	bts_pmu.free_aux	= bts_buffer_free_aux;
+
+	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..a5149c7abe73 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr |= DEBUGCTLMSR_TR;
 	debugctlmsr |= DEBUGCTLMSR_BTS;
-	debugctlmsr |= DEBUGCTLMSR_BTINT;
+	if (config & ARCH_PERFMON_EVENTSEL_INT)
+		debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
 		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-- 
cgit v1.2.3


From 9a5e3fb52ae5458c8bf1a67129b96c39b541a582 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:06:53 +0100
Subject: perf/x86: Rename x86_pmu::er_flags to 'flags'

Because it will be used for more than just tracking the
presence of extra registers.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  9 ++++++---
 arch/x86/kernel/cpu/perf_event_intel.c | 24 ++++++++++++------------
 2 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index eaebfd707016..5264010c9a08 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -521,7 +521,7 @@ struct x86_pmu {
 	 * Extra registers for events
 	 */
 	struct extra_reg *extra_regs;
-	unsigned int er_flags;
+	unsigned int flags;
 
 	/*
 	 * Intel host/guest support (KVM)
@@ -545,8 +545,11 @@ do {									\
 	x86_pmu.quirks = &__quirk;					\
 } while (0)
 
-#define ERF_NO_HT_SHARING	1
-#define ERF_HAS_RSP_1		2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 1c78f44f4f93..e85988e2ecc7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1667,7 +1667,7 @@ intel_bts_constraints(struct perf_event *event)
 
 static int intel_alt_er(int idx)
 {
-	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
 		return idx;
 
 	if (idx == EXTRA_REG_RSP_0)
@@ -2250,7 +2250,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!cpuc->shared_regs)
 		return;
 
-	if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
@@ -2671,7 +2671,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_slm_event_constraints;
 		x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_slm_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		pr_cont("Silvermont events, ");
 		break;
 
@@ -2689,7 +2689,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 
 		x86_pmu.cpu_events = nhm_events_attrs;
 
@@ -2721,8 +2721,8 @@ __init int intel_pmu_init(void)
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2756,8 +2756,8 @@ __init int intel_pmu_init(void)
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2784,8 +2784,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2817,8 +2817,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
-- 
cgit v1.2.3


From 90413464313e00fe4975f4a0ebf25fe31d01f793 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:06:54 +0100
Subject: perf/x86: Vectorize cpuc->kfree_on_online

Make the cpuc->kfree_on_online a vector to accommodate
more than one entry and add the second entry to be
used by a later patch.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       | 10 +++++++---
 arch/x86/kernel/cpu/perf_event.h       |  8 +++++++-
 arch/x86/kernel/cpu/perf_event_amd.c   |  3 ++-
 arch/x86/kernel/cpu/perf_event_intel.c |  4 +++-
 4 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 549d01d6d996..682ef00727e7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1373,11 +1373,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	int ret = NOTIFY_OK;
+	int i, ret = NOTIFY_OK;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
-		cpuc->kfree_on_online = NULL;
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+			cpuc->kfree_on_online[i] = NULL;
 		if (x86_pmu.cpu_prepare)
 			ret = x86_pmu.cpu_prepare(cpu);
 		break;
@@ -1388,7 +1389,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_ONLINE:
-		kfree(cpuc->kfree_on_online);
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+			kfree(cpuc->kfree_on_online[i]);
+			cpuc->kfree_on_online[i] = NULL;
+		}
 		break;
 
 	case CPU_DYING:
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 5264010c9a08..55b915511e53 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -125,6 +125,12 @@ struct intel_shared_regs {
 
 #define MAX_LBR_ENTRIES		16
 
+enum {
+	X86_PERF_KFREE_SHARED = 0,
+	X86_PERF_KFREE_EXCL   = 1,
+	X86_PERF_KFREE_MAX
+};
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -187,7 +193,7 @@ struct cpu_hw_events {
 	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
 	u64				perf_ctr_virt_mask;
 
-	void				*kfree_on_online;
+	void				*kfree_on_online[X86_PERF_KFREE_MAX];
 };
 
 #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..e4302b8fed2a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
 static void amd_pmu_cpu_starting(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 	struct amd_nb *nb;
 	int i, nb_id;
 
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
 			continue;
 
 		if (nb->nb_id == nb_id) {
-			cpuc->kfree_on_online = cpuc->amd_nb;
+			*onln = cpuc->amd_nb;
 			cpuc->amd_nb = nb;
 			break;
 		}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e85988e2ecc7..c0ed5a4b9537 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2251,12 +2251,14 @@ static void intel_pmu_cpu_starting(int cpu)
 		return;
 
 	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
 			if (pc && pc->core_id == core_id) {
-				cpuc->kfree_on_online = cpuc->shared_regs;
+				*onln = cpuc->shared_regs;
 				cpuc->shared_regs = pc;
 				break;
 			}
-- 
cgit v1.2.3


From c5362c0c376486afcf3c91d3c2691d348ac1e2fd Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Date: Mon, 17 Nov 2014 20:06:55 +0100
Subject: perf/x86: Add 3 new scheduling callbacks

This patch adds 3 new PMU model specific callbacks
during the event scheduling done by x86_schedule_events().

  ->start_scheduling():  invoked when entering the schedule routine.
  ->stop_scheduling():   invoked at the end of the schedule routine
  ->commit_scheduling(): invoked for each committed event

To be used optionally by model-specific code.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-4-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++++++++
 arch/x86/kernel/cpu/perf_event.h | 9 +++++++++
 2 files changed, 18 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 682ef00727e7..cd6115867fb8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -784,6 +784,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
+	if (x86_pmu.start_scheduling)
+		x86_pmu.start_scheduling(cpuc);
+
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
 		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
@@ -830,6 +833,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+			if (x86_pmu.commit_scheduling)
+				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
 	/*
@@ -850,6 +855,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
 	}
+
+	if (x86_pmu.stop_scheduling)
+		x86_pmu.stop_scheduling(cpuc);
+
 	return num ? -EINVAL : 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 55b915511e53..ea27e63fc945 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -460,6 +460,15 @@ struct x86_pmu {
 
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
+
+	void		(*commit_scheduling)(struct cpu_hw_events *cpuc,
+					     struct perf_event *event,
+					     int cntr);
+
+	void		(*start_scheduling)(struct cpu_hw_events *cpuc);
+
+	void		(*stop_scheduling)(struct cpu_hw_events *cpuc);
+
 	struct event_constraint *event_constraints;
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
-- 
cgit v1.2.3


From 79cba822443a168c8f7f5b853d9c7225a6d5415e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:06:56 +0100
Subject: perf/x86: Add 'index' param to get_event_constraint() callback

This patch adds an index parameter to the get_event_constraint()
x86_pmu callback. It is expected to represent the index of the
event in the cpuc->event_list[] array. When the callback is used
for fake_cpuc (evnet validation), then the index must be -1. The
motivation for passing the index is to use it to index into another
cpuc array.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-5-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++--
 arch/x86/kernel/cpu/perf_event.h       |  4 +++-
 arch/x86/kernel/cpu/perf_event_amd.c   |  6 ++++--
 arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++-----
 4 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cd6115867fb8..71755401476c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -789,7 +789,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
 		hwc->constraint = c;
 
 		wmin = min(wmin, c->weight);
@@ -1777,7 +1777,7 @@ static int validate_event(struct perf_event *event)
 	if (IS_ERR(fake_cpuc))
 		return PTR_ERR(fake_cpuc);
 
-	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
 
 	if (!c || !c->weight)
 		ret = -EINVAL;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index ea27e63fc945..24a65057c1c0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -456,6 +456,7 @@ struct x86_pmu {
 	u64		max_period;
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
+						 int idx,
 						 struct perf_event *event);
 
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
@@ -751,7 +752,8 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event);
 
 struct intel_shared_regs *allocate_shared_regs(int cpu);
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index e4302b8fed2a..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -430,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
 }
 
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	/*
 	 * if not NB event or no NB, then no constraints
@@ -538,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
 static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
 
 static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+			       struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c0ed5a4b9537..2dd34b57d3ff 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1827,7 +1827,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
 }
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1844,7 +1845,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1860,7 +1862,7 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	return x86_get_event_constraints(cpuc, event);
+	return x86_get_event_constraints(cpuc, idx, event);
 }
 
 static void
@@ -2105,9 +2107,12 @@ static struct event_constraint counter2_constraint =
 			EVENT_CONSTRAINT(0, 0x4, 0);
 
 static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
-	struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+	struct event_constraint *c;
+
+	c = intel_get_event_constraints(cpuc, idx, event);
 
 	/* Handle special quirk on in_tx_checkpointed only in counter 2 */
 	if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
-- 
cgit v1.2.3


From 6f6539cad926f55d5eb6e79d05bbe99f0d54d56d Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Date: Mon, 17 Nov 2014 20:06:57 +0100
Subject: perf/x86/intel: Add cross-HT counter exclusion infrastructure

This patch adds a new shared_regs style structure to the
per-cpu x86 state (cpuc). It is used to coordinate access
between counters which must be used with exclusion across
HyperThreads on Intel processors. This new struct is not
needed on each PMU, thus is is allocated on demand.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[peterz: spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-6-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       | 32 +++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 71 +++++++++++++++++++++++++++++++---
 2 files changed, 98 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 24a65057c1c0..f31f90e2d859 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -123,6 +124,26 @@ struct intel_shared_regs {
 	unsigned                core_id;	/* per-core: core id */
 };
 
+enum intel_excl_state_type {
+	INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+	INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+	INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+};
+
+struct intel_excl_cntrs {
+	raw_spinlock_t	lock;
+
+	struct intel_excl_states states[2];
+
+	int		refcnt;		/* per-core: #HT threads */
+	unsigned	core_id;	/* per-core: core id */
+};
+
 #define MAX_LBR_ENTRIES		16
 
 enum {
@@ -185,6 +206,12 @@ struct cpu_hw_events {
 	 * used on Intel NHM/WSM/SNB
 	 */
 	struct intel_shared_regs	*shared_regs;
+	/*
+	 * manage exclusive counter access between hyperthread
+	 */
+	struct event_constraint *constraint_list; /* in enable order */
+	struct intel_excl_cntrs		*excl_cntrs;
+	int excl_thread_id; /* 0 or 1 */
 
 	/*
 	 * AMD specific bits
@@ -208,6 +235,10 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
 
+#define INTEL_EXCLEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+			   0, PERF_X86_EVENT_EXCL)
+
 /*
  * The overlap flag marks event constraints with overlapping counter
  * masks. This is the case if the counter mask of such an event is not
@@ -566,6 +597,7 @@ do {									\
  */
 #define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
 #define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS	0x4 /* has exclusive counter requirements  */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2dd34b57d3ff..7f54000fd0f1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2224,16 +2224,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
 	return regs;
 }
 
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+	struct intel_excl_cntrs *c;
+	int i;
+
+	c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+			 GFP_KERNEL, cpu_to_node(cpu));
+	if (c) {
+		raw_spin_lock_init(&c->lock);
+		for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+			c->states[0].state[i] = INTEL_EXCL_UNUSED;
+			c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+			c->states[1].state[i] = INTEL_EXCL_UNUSED;
+			c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+		}
+		c->core_id = -1;
+	}
+	return c;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
-		return NOTIFY_OK;
+	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+		cpuc->shared_regs = allocate_shared_regs(cpu);
+		if (!cpuc->shared_regs)
+			return NOTIFY_BAD;
+	}
 
-	cpuc->shared_regs = allocate_shared_regs(cpu);
-	if (!cpuc->shared_regs)
-		return NOTIFY_BAD;
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+		cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+		if (!cpuc->constraint_list)
+			return NOTIFY_BAD;
+
+		cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+		if (!cpuc->excl_cntrs) {
+			kfree(cpuc->constraint_list);
+			kfree(cpuc->shared_regs);
+			return NOTIFY_BAD;
+		}
+		cpuc->excl_thread_id = 0;
+	}
 
 	return NOTIFY_OK;
 }
@@ -2274,12 +2310,29 @@ static void intel_pmu_cpu_starting(int cpu)
 
 	if (x86_pmu.lbr_sel_map)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+			struct intel_excl_cntrs *c;
+
+			c = per_cpu(cpu_hw_events, i).excl_cntrs;
+			if (c && c->core_id == core_id) {
+				cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+				cpuc->excl_cntrs = c;
+				cpuc->excl_thread_id = 1;
+				break;
+			}
+		}
+		cpuc->excl_cntrs->core_id = core_id;
+		cpuc->excl_cntrs->refcnt++;
+	}
 }
 
 static void intel_pmu_cpu_dying(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 	struct intel_shared_regs *pc;
+	struct intel_excl_cntrs *c;
 
 	pc = cpuc->shared_regs;
 	if (pc) {
@@ -2287,6 +2340,14 @@ static void intel_pmu_cpu_dying(int cpu)
 			kfree(pc);
 		cpuc->shared_regs = NULL;
 	}
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+		kfree(cpuc->constraint_list);
+		cpuc->constraint_list = NULL;
+	}
 
 	fini_debug_store_on_cpu(cpu);
 }
-- 
cgit v1.2.3


From e979121b1b1556e184492e6fc149bbe188fc83e6 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Date: Mon, 17 Nov 2014 20:06:58 +0100
Subject: perf/x86/intel: Implement cross-HT corruption bug workaround

This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:

  - SandyBridge: BJ122
  - IvyBridge: BV98
  - Haswell: HSD29

The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.

There is no HW or FW workaround for this problem.

The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -r20cc sleep 10
  Performance counter stats for 'system wide':
        139 277 291      r20cc
       10,000969126 seconds time elapsed

In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.

This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.

The same example run with the workaround enabled, yield the correct answer:

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -r20cc sleep 10
  Performance counter stats for 'system wide':
        0 r20cc
       10,000969126 seconds time elapsed

The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.

The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.

As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.

Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.

This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  31 ++--
 arch/x86/kernel/cpu/perf_event.h       |   6 +
 arch/x86/kernel/cpu/perf_event_intel.c | 307 ++++++++++++++++++++++++++++++++-
 3 files changed, 331 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 71755401476c..b8b7a1277d8d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -779,7 +779,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	struct perf_event *e;
-	int i, wmin, wmax, num = 0;
+	int i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
@@ -822,14 +822,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	/* slow path */
 	if (i != n)
-		num = perf_assign_events(cpuc->event_list, n, wmin,
-					 wmax, assign);
+		unsched = perf_assign_events(cpuc->event_list, n, wmin,
+					     wmax, assign);
 
 	/*
-	 * Mark the event as committed, so we do not put_constraint()
-	 * in case new events are added and fail scheduling.
+	 * In case of success (unsched = 0), mark events as committed,
+	 * so we do not put_constraint() in case new events are added
+	 * and fail to be scheduled
+	 *
+	 * We invoke the lower level commit callback to lock the resource
+	 *
+	 * We do not need to do all of this in case we are called to
+	 * validate an event group (assign == NULL)
 	 */
-	if (!num && assign) {
+	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
@@ -837,11 +843,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
-	/*
-	 * scheduling failed or is just a simulation,
-	 * free resources if necessary
-	 */
-	if (!assign || num) {
+
+	if (!assign || unsched) {
+
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -851,6 +855,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
 				continue;
 
+			/*
+			 * release events that failed scheduling
+			 */
 			if (x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
@@ -859,7 +866,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (x86_pmu.stop_scheduling)
 		x86_pmu.stop_scheduling(cpuc);
 
-	return num ? -EINVAL : 0;
+	return unsched ? -EINVAL : 0;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f31f90e2d859..236afee35587 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -72,6 +72,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
 #define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -133,6 +134,7 @@ enum intel_excl_state_type {
 struct intel_excl_states {
 	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	bool sched_started; /* true if scheduling has started */
 };
 
 struct intel_excl_cntrs {
@@ -296,6 +298,10 @@ struct cpu_hw_events {
 #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
 
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+			   HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
 #define INTEL_PLD_CONSTRAINT(c, n)	\
 	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			   HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f54000fd0f1..91cc7749d7ce 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1845,7 +1845,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -1865,6 +1865,254 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	return x86_get_event_constraints(cpuc, idx, event);
 }
 
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	xl->sched_started = true;
+
+	/*
+	 * lock shared state until we are done scheduling
+	 * in stop_event_scheduling()
+	 * makes scheduling appear as a transaction
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+	raw_spin_lock(&excl_cntrs->lock);
+
+	/*
+	 * save initial state of sibling thread
+	 */
+	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	/*
+	 * make new sibling thread state visible
+	 */
+	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+	xl->sched_started = false;
+	/*
+	 * release shared state lock (acquired in intel_start_scheduling())
+	 */
+	raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+			   int idx, struct event_constraint *c)
+{
+	struct event_constraint *cx;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int is_excl, i;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* alternate */
+
+	/*
+	 * validating a group does not require
+	 * enforcing cross-thread  exclusion
+	 */
+	if (cpuc->is_fake)
+		return c;
+
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	/*
+	 * xl = state of current HT
+	 * xlo = state of sibling HT
+	 */
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	cx = c;
+
+	/*
+	 * because we modify the constraint, we need
+	 * to make a copy. Static constraints come
+	 * from static const tables.
+	 *
+	 * only needed when constraint has not yet
+	 * been cloned (marked dynamic)
+	 */
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+		/* sanity check */
+		if (idx < 0)
+			return &emptyconstraint;
+
+		/*
+		 * grab pre-allocated constraint entry
+		 */
+		cx = &cpuc->constraint_list[idx];
+
+		/*
+		 * initialize dynamic constraint
+		 * with static constraint
+		 */
+		memcpy(cx, c, sizeof(*cx));
+
+		/*
+		 * mark constraint as dynamic, so we
+		 * can free it later on
+		 */
+		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+	}
+
+	/*
+	 * From here on, the constraint is dynamic.
+	 * Either it was just allocated above, or it
+	 * was allocated during a earlier invocation
+	 * of this function
+	 */
+
+	/*
+	 * Modify static constraint with current dynamic
+	 * state of thread
+	 *
+	 * EXCLUSIVE: sibling counter measuring exclusive event
+	 * SHARED   : sibling counter measuring non-exclusive event
+	 * UNUSED   : sibling counter unused
+	 */
+	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+		/*
+		 * exclusive event in sibling counter
+		 * our corresponding counter cannot be used
+		 * regardless of our event
+		 */
+		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, cx->idxmsk);
+		/*
+		 * if measuring an exclusive event, sibling
+		 * measuring non-exclusive, then counter cannot
+		 * be used
+		 */
+		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, cx->idxmsk);
+	}
+
+	/*
+	 * recompute actual bit weight for scheduling algorithm
+	 */
+	cx->weight = hweight64(cx->idxmsk64);
+
+	/*
+	 * if we return an empty mask, then switch
+	 * back to static empty constraint to avoid
+	 * the cost of freeing later on
+	 */
+	if (cx->weight == 0)
+		cx = &emptyconstraint;
+
+	return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
+{
+	struct event_constraint *c = event->hw.constraint;
+
+	/*
+	 * first time only
+	 * - static constraint: no change across incremental scheduling calls
+	 * - dynamic constraint: handled by intel_get_excl_constraints()
+	 */
+	if (!c)
+		c = __intel_get_event_constraints(cpuc, idx, event);
+
+	if (cpuc->excl_cntrs)
+		return intel_get_excl_constraints(cpuc, event, idx, c);
+
+	return c;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+		struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xlo, *xl;
+	unsigned long flags = 0; /* keep compiler happy */
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	/*
+	 * put_constraint may be called from x86_schedule_events()
+	 * which already has the lock held so here make locking
+	 * conditional
+	 */
+	if (!xl->sched_started)
+		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+	/*
+	 * if event was actually assigned, then mark the
+	 * counter state as unused now
+	 */
+	if (hwc->idx >= 0)
+		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+	if (!xl->sched_started)
+		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+}
+
 static void
 intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
@@ -1883,7 +2131,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
+	struct event_constraint *c = event->hw.constraint;
+
 	intel_put_shared_regs_event_constraints(cpuc, event);
+
+	/*
+	 * is PMU has exclusive counter restrictions, then
+	 * all events are subject to and must call the
+	 * put_excl_constraints() routine
+	 */
+	if (c && cpuc->excl_cntrs)
+		intel_put_excl_constraints(cpuc, event);
+
+	/* cleanup dynamic constraint */
+	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+		event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+				    struct perf_event *event, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = event->hw.constraint;
+	struct intel_excl_states *xlo, *xl;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+	int is_excl;
+
+	if (cpuc->is_fake || !c)
+		return;
+
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+	if (cntr >= 0) {
+		if (is_excl)
+			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+		else
+			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2349,6 +2647,13 @@ static void intel_pmu_cpu_dying(int cpu)
 		cpuc->constraint_list = NULL;
 	}
 
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+	}
+
 	fini_debug_store_on_cpu(cpu);
 }
 
-- 
cgit v1.2.3


From 93fcf72cc0fa286aa8c3e11a1a8fd4659f0e27c0 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Date: Mon, 17 Nov 2014 20:06:59 +0100
Subject: perf/x86/intel: Enforce HT bug workaround for SNB/IVB/HSW

This patches activates the HT bug workaround for the
SNB/IVB/HSW processors. This covers non-PEBS mode.
Activation is done thru the constraint tables.

Both client and server processors needs this workaround.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-8-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 53 ++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 91cc7749d7ce..9350de0f5407 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -113,6 +113,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
 	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -131,15 +137,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
 	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-	/*
-	 * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
-	 * siblings; disable these events because they can corrupt unrelated
-	 * counters.
-	 */
-	INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -217,6 +220,12 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
 	/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -2865,6 +2874,27 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * this is too difficult and model specific so we enable
+ * it even with HT off for now.
+ */
+static __init void intel_ht_bug(void)
+{
+	x86_pmu.flags |= PMU_FL_EXCL_CNTRS;
+
+	x86_pmu.commit_scheduling = intel_commit_scheduling;
+	x86_pmu.start_scheduling = intel_start_scheduling;
+	x86_pmu.stop_scheduling = intel_stop_scheduling;
+
+	pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n");
+}
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82")
 
@@ -3079,6 +3109,7 @@ __init int intel_pmu_init(void)
 	case 42: /* 32nm SandyBridge         */
 	case 45: /* 32nm SandyBridge-E/EN/EP */
 		x86_add_quirk(intel_sandybridge_quirk);
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -3093,6 +3124,8 @@ __init int intel_pmu_init(void)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3111,6 +3144,7 @@ __init int intel_pmu_init(void)
 
 	case 58: /* 22nm IvyBridge       */
 	case 62: /* 22nm IvyBridge-EP/EX */
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		/* dTLB-load-misses on IVB is different than SNB */
@@ -3146,6 +3180,7 @@ __init int intel_pmu_init(void)
 	case 63: /* 22nm Haswell Server */
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+		x86_add_quirk(intel_ht_bug);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-- 
cgit v1.2.3


From b63b4b459a78a9a45ea47a4803b8d1868e9d17d5 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Date: Mon, 17 Nov 2014 20:07:00 +0100
Subject: perf/x86/intel: Enforce HT bug workaround with PEBS for SNB/IVB/HSW

This patch modifies the PEBS constraint tables for SNB/IVB/HSW
such that corrupting events supporting PEBS activate the HT
workaround.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-9-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h          | 20 +++++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 28 ++++++++++++++++++----------
 2 files changed, 37 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 236afee35587..2ba71ecc244f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -326,22 +326,40 @@ struct cpu_hw_events {
 
 /* Check flags and event code, and set the HSW load flag */
 #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-	__EVENT_CONSTRAINT(code, n, 			\
+	__EVENT_CONSTRAINT(code, n,			\
 			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW store flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW load flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW N/A flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a5149c7abe73..ca69ea56c712 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -612,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
@@ -623,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
         EVENT_CONSTRAINT_END
@@ -634,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
-- 
cgit v1.2.3


From a90738c2cb0dceb882e0d7f7f9141e0062809b4d Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:07:01 +0100
Subject: perf/x86/intel: Fix intel_get_event_constraints() for dynamic
 constraints

With dynamic constraint, we need to restart from the static
constraints each time the intel_get_event_constraints() is called.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-10-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9350de0f5407..4773ae0b52d0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2063,20 +2063,25 @@ static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
-	struct event_constraint *c = event->hw.constraint;
+	struct event_constraint *c1 = event->hw.constraint;
+	struct event_constraint *c2;
 
 	/*
 	 * first time only
 	 * - static constraint: no change across incremental scheduling calls
 	 * - dynamic constraint: handled by intel_get_excl_constraints()
 	 */
-	if (!c)
-		c = __intel_get_event_constraints(cpuc, idx, event);
+	c2 = __intel_get_event_constraints(cpuc, idx, event);
+	if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+		bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+		c1->weight = c2->weight;
+		c2 = c1;
+	}
 
 	if (cpuc->excl_cntrs)
-		return intel_get_excl_constraints(cpuc, event, idx, c);
+		return intel_get_excl_constraints(cpuc, event, idx, c2);
 
-	return c;
+	return c2;
 }
 
 static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
-- 
cgit v1.2.3


From c02cdbf60b51b8d98a49185535f5d527a2965142 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:07:02 +0100
Subject: perf/x86/intel: Limit to half counters when the HT workaround is
 enabled, to avoid exclusive mode starvation

This patch limits the number of counters available to each CPU when
the HT bug workaround is enabled.

This is necessary to avoid situation of counter starvation. Such can
arise from configuration where one HT thread, HT0, is using all 4 counters
with corrupting events which require exclusion the the sibling HT, HT1.

In such case, HT1 would not be able to schedule any event until HT0
is done. To mitigate this problem, this patch artificially limits
the number of counters to 2.

That way, we can gurantee that at least 2 counters are not in exclusive
mode and therefore allow the sibling thread to schedule events of the
same type (system vs. per-thread). The 2 counters are not determined
in advance. We simply set the limit to two events per HT.

This helps mitigate starvation in case of events with specific counter
constraints such a PREC_DIST.

Note that this does not elimintate the starvation is all cases. But
it is better than not having it.

(Solution suggested by Peter Zjilstra.)

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-11-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  2 ++
 arch/x86/kernel/cpu/perf_event_intel.c | 22 ++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 2ba71ecc244f..176749cca98f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -134,6 +134,8 @@ enum intel_excl_state_type {
 struct intel_excl_states {
 	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	int  num_alloc_cntrs;/* #counters allocated */
+	int  max_alloc_cntrs;/* max #counters allowed */
 	bool sched_started; /* true if scheduling has started */
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 4773ae0b52d0..4187d3f4ed12 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1897,7 +1897,7 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	xl = &excl_cntrs->states[tid];
 
 	xl->sched_started = true;
-
+	xl->num_alloc_cntrs = 0;
 	/*
 	 * lock shared state until we are done scheduling
 	 * in stop_event_scheduling()
@@ -1963,7 +1963,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 */
 	if (cpuc->is_fake)
 		return c;
-
 	/*
 	 * event requires exclusive counter access
 	 * across HT threads
@@ -1977,6 +1976,18 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	xl = &excl_cntrs->states[tid];
 	xlo = &excl_cntrs->states[o_tid];
 
+	/*
+	 * do not allow scheduling of more than max_alloc_cntrs
+	 * which is set to half the available generic counters.
+	 * this helps avoid counter starvation of sibling thread
+	 * by ensuring at most half the counters cannot be in
+	 * exclusive mode. There is not designated counters for the
+	 * limits. Any N/2 counters can be used. This helps with
+	 * events with specifix counter constraints
+	 */
+	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+		return &emptyconstraint;
+
 	cx = c;
 
 	/*
@@ -2624,6 +2635,8 @@ static void intel_pmu_cpu_starting(int cpu)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		int h = x86_pmu.num_counters >> 1;
+
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_excl_cntrs *c;
 
@@ -2637,6 +2650,11 @@ static void intel_pmu_cpu_starting(int cpu)
 		}
 		cpuc->excl_cntrs->core_id = core_id;
 		cpuc->excl_cntrs->refcnt++;
+		/*
+		 * set hard limit to half the number of generic counters
+		 */
+		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
 	}
 }
 
-- 
cgit v1.2.3


From b37609c30e41264c4df4acff78abfc894499a49b Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:07:04 +0100
Subject: perf/x86/intel: Make the HT bug workaround conditional on HT enabled

This patch disables the PMU HT bug when Hyperthreading (HT)
is disabled. We cannot do this test immediately when perf_events
is initialized. We need to wait until the topology information
is setup properly. As such, we register a later initcall, check
the topology and potentially disable the workaround. To do this,
we need to ensure there is no user of the PMU. At this point of
the boot, the only user is the NMI watchdog, thus we disable
it during the switch and re-enable it right after.

Having the workaround disabled when it is not needed provides
some benefits by limiting the overhead is time and space.
The workaround still ensures correct scheduling of the corrupting
memory events (0xd0, 0xd1, 0xd2) when HT is off. Those events
can only be measured on counters 0-3. Something else the current
kernel did not handle correctly.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-13-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  5 ++
 arch/x86/kernel/cpu/perf_event_intel.c | 95 ++++++++++++++++++++++++++--------
 2 files changed, 79 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 176749cca98f..7250c0281f9d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -624,6 +624,7 @@ do {									\
 #define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
 #define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
 #define PMU_FL_EXCL_CNTRS	0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED	0x8 /* exclusive counter active */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -904,6 +905,10 @@ int knc_pmu_init(void);
 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
 
+static inline int is_ht_workaround_enabled(void)
+{
+	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
 #else /* CONFIG_CPU_SUP_INTEL */
 
 static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 4187d3f4ed12..6ea61a572fb0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/watchdog.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -1885,8 +1886,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * nothing needed if in group validation mode
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
 		return;
+
 	/*
 	 * no exclusion needed
 	 */
@@ -1923,7 +1925,7 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * nothing needed if in group validation mode
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
 		return;
 	/*
 	 * no exclusion needed
@@ -1961,7 +1963,13 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * validating a group does not require
 	 * enforcing cross-thread  exclusion
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return c;
+
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
 		return c;
 	/*
 	 * event requires exclusive counter access
@@ -2658,18 +2666,11 @@ static void intel_pmu_cpu_starting(int cpu)
 	}
 }
 
-static void intel_pmu_cpu_dying(int cpu)
+static void free_excl_cntrs(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	struct intel_shared_regs *pc;
 	struct intel_excl_cntrs *c;
 
-	pc = cpuc->shared_regs;
-	if (pc) {
-		if (pc->core_id == -1 || --pc->refcnt == 0)
-			kfree(pc);
-		cpuc->shared_regs = NULL;
-	}
 	c = cpuc->excl_cntrs;
 	if (c) {
 		if (c->core_id == -1 || --c->refcnt == 0)
@@ -2678,14 +2679,22 @@ static void intel_pmu_cpu_dying(int cpu)
 		kfree(cpuc->constraint_list);
 		cpuc->constraint_list = NULL;
 	}
+}
 
-	c = cpuc->excl_cntrs;
-	if (c) {
-		if (c->core_id == -1 || --c->refcnt == 0)
-			kfree(c);
-		cpuc->excl_cntrs = NULL;
+static void intel_pmu_cpu_dying(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct intel_shared_regs *pc;
+
+	pc = cpuc->shared_regs;
+	if (pc) {
+		if (pc->core_id == -1 || --pc->refcnt == 0)
+			kfree(pc);
+		cpuc->shared_regs = NULL;
 	}
 
+	free_excl_cntrs(cpu);
+
 	fini_debug_store_on_cpu(cpu);
 }
 
@@ -2904,18 +2913,18 @@ static __init void intel_nehalem_quirk(void)
  * HSW: HSD29
  *
  * Only needed when HT is enabled. However detecting
- * this is too difficult and model specific so we enable
- * it even with HT off for now.
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
  */
 static __init void intel_ht_bug(void)
 {
-	x86_pmu.flags |= PMU_FL_EXCL_CNTRS;
+	x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
 
 	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.start_scheduling = intel_start_scheduling;
 	x86_pmu.stop_scheduling = intel_stop_scheduling;
-
-	pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n");
 }
 
 EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
@@ -3343,3 +3352,47 @@ __init int intel_pmu_init(void)
 
 	return 0;
 }
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+	int cpu = smp_processor_id();
+	int w, c;
+	/*
+	 * problem not present on this CPU model, nothing to do
+	 */
+	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+		return 0;
+
+	w = cpumask_weight(topology_thread_cpumask(cpu));
+	if (w > 1) {
+		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+		return 0;
+	}
+
+	watchdog_nmi_disable_all();
+
+	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+	x86_pmu.commit_scheduling = NULL;
+	x86_pmu.start_scheduling = NULL;
+	x86_pmu.stop_scheduling = NULL;
+
+	watchdog_nmi_enable_all();
+
+	get_online_cpus();
+
+	for_each_online_cpu(c) {
+		free_excl_cntrs(c);
+	}
+
+	put_online_cpus();
+	pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+	return 0;
+}
+subsys_initcall(fixup_ht_bug)
-- 
cgit v1.2.3


From 8882edf735738c949aba4b65d3ec3453066bab12 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 27 Feb 2015 09:48:30 -0800
Subject: perf/x86/intel: Reset more state in PMU reset

The PMU reset code didn't quite keep up with newer PMU features.
Improve it a bit to really reset a modern PMU:

  - Clear all overflow status
  - Clear LBRs and freezing state
  - Disable fixed counters too

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 6ea61a572fb0..59994602bb94 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1538,6 +1538,18 @@ static void intel_pmu_reset(void)
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
+	/* Ack all overflows and disable fixed counters */
+	if (x86_pmu.version >= 2) {
+		intel_pmu_ack_status(intel_pmu_get_status());
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+	}
+
+	/* Reset LBRs and LBR freezing */
+	if (x86_pmu.lbr_nr) {
+		update_debugctlmsr(get_debugctlmsr() &
+			~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+	}
+
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From da3e606d885a17525eb18afd423f5c438860b833 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 27 Feb 2015 09:48:31 -0800
Subject: perf/x86: Dump DEBUGCTL in PMU dump

LBRs and LBR freezing are controlled through the DEBUGCTL MSR. So
dump the state of DEBUGCTL too when dumping the PMU state.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b8b7a1277d8d..994737263daa 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1171,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-	u64 pebs;
+	u64 pebs, debugctl;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1197,6 +1197,10 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		if (x86_pmu.lbr_nr) {
+			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+		}
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
-- 
cgit v1.2.3


From 15fde1101a1aed11958e0d86bc360f01866a74b1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 27 Feb 2015 09:48:32 -0800
Subject: perf/x86: Only dump PEBS register when PEBS has been detected

Technically PEBS_ENABLED is only guaranteed to exist when we
detected PEBS. So add a check for this to the PMU dump function.
I don't think it can happen on a real CPU, but could in a VM.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-4-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 994737263daa..689e35760924 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1189,14 +1189,16 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		if (x86_pmu.pebs_constraints) {
+			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		}
 		if (x86_pmu.lbr_nr) {
 			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
-- 
cgit v1.2.3


From 1a78d93750bb5f61abdc59a91fc3bd06a214542a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 20 Mar 2015 10:11:23 -0700
Subject: perf/x86/intel: Streamline LBR MSR handling in PMI

The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.

So there is no need to disable the LBRs explicitely in the
PMI handler.

Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.

 5)               |  /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
 5)               |  /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
 5)               |  /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
 5)               |  /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
 5)               |  /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */

This patch:

  - Avoids disabling already frozen LBRs unnecessarily in the PMI
  - Avoids changing LBR_SELECT in the PMI

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  2 +-
 arch/x86/kernel/cpu/perf_event_intel.c     | 23 ++++++++++++++++++-----
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 12 ++++++++----
 3 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7250c0281f9d..329f0356ad4a 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -870,7 +870,7 @@ void intel_pmu_lbr_enable(struct perf_event *event);
 
 void intel_pmu_lbr_disable(struct perf_event *event);
 
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 59994602bb94..9da2400c2ec3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1244,7 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1256,15 +1259,20 @@ static void intel_pmu_disable_all(void)
 		intel_bts_disable_local();
 
 	intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+	__intel_pmu_disable_all();
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
+	intel_pmu_lbr_enable_all(pmi);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
 			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
@@ -1280,6 +1288,11 @@ static void intel_pmu_enable_all(int added)
 		intel_bts_enable_local();
 }
 
+static void intel_pmu_enable_all(int added)
+{
+	__intel_pmu_enable_all(added, false);
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -1573,7 +1586,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	 */
 	if (!x86_pmu.late_ack)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	intel_pmu_disable_all();
+	__intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
 	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
@@ -1658,7 +1671,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all(0);
+	__intel_pmu_enable_all(0, true);
 	/*
 	 * Only unmask the NMI after the overflow counters
 	 * have been reset. This avoids spurious NMIs on
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0473874109cb..3d537252f011 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -132,12 +132,16 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	u64 debugctl, lbr_select = 0;
 
-	if (cpuc->lbr_sel) {
+	/*
+	 * No need to reprogram LBR_SELECT in a PMI, as it
+	 * did not change.
+	 */
+	if (cpuc->lbr_sel && !pmi) {
 		lbr_select = cpuc->lbr_sel->config;
 		wrmsrl(MSR_LBR_SELECT, lbr_select);
 	}
@@ -351,12 +355,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 	}
 }
 
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	if (cpuc->lbr_users)
-		__intel_pmu_lbr_enable();
+		__intel_pmu_lbr_enable(pmi);
 }
 
 void intel_pmu_lbr_disable_all(void)
-- 
cgit v1.2.3


From cd1f11de69226cc7ce7e7f22bdab9010103ddaa6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 20 Mar 2015 10:11:24 -0700
Subject: perf/x86/intel: Avoid rewriting DEBUGCTL with the same value for LBRs

perf with LBRs on has a tendency to rewrite the DEBUGCTL MSR with
the same value. Add a little optimization to skip the unnecessary
write.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3d537252f011..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -135,7 +135,7 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 static void __intel_pmu_lbr_enable(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	u64 debugctl, lbr_select = 0;
+	u64 debugctl, lbr_select = 0, orig_debugctl;
 
 	/*
 	 * No need to reprogram LBR_SELECT in a PMI, as it
@@ -147,6 +147,7 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	}
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	orig_debugctl = debugctl;
 	debugctl |= DEBUGCTLMSR_LBR;
 	/*
 	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
@@ -155,7 +156,8 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	 */
 	if (!(lbr_select & LBR_CALL_STACK))
 		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	if (orig_debugctl != debugctl)
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
 static void __intel_pmu_lbr_disable(void)
-- 
cgit v1.2.3


From 2e54a5bdba107f80a9ba90065fb43bba0498147d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Apr 2015 17:57:59 +0200
Subject: perf/x86/intel/pt: Fix the 32-bit build

On a 32-bit build I got:

  arch/x86/kernel/cpu/perf_event_intel_pt.c:413:5: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  arch/x86/kernel/cpu/perf_event_intel_bts.c:162:24: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]

Fix it. The code should probably be (re-)tested on 32-bit systems to make
sure all is fine.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_bts.c | 2 +-
 arch/x86/kernel/cpu/perf_event_intel_pt.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index fb1a4c28f3e1..ac1f0c55f379 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -159,7 +159,7 @@ bts_config_buffer(struct bts_buffer *buf)
 			thresh = end;
 	}
 
-	ds->bts_buffer_base = (u64)page_address(page) + phys->displacement;
+	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
 	ds->bts_index = ds->bts_buffer_base + index;
 	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
 	ds->bts_interrupt_threshold = !buf->snapshot
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index a9a1092cf836..f5a3afc65371 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -409,8 +409,8 @@ static void pt_topa_dump(struct pt_buffer *buf)
 	list_for_each_entry(topa, &buf->tables, list) {
 		int i;
 
-		pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table,
-			 (void *)topa->phys, topa->offset, topa->size);
+		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+			 topa->phys, topa->offset, topa->size);
 		for (i = 0; i < TENTS_PER_PAGE; i++) {
 			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
 				 &topa->table[i],
-- 
cgit v1.2.3


From 824b43763c562ee2feec16bb4017785528f3b54c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Thu, 9 Apr 2015 12:55:46 +0200
Subject: crypto: x86/sha1_ssse3 - move SHA-1 SSSE3 implementation to base
 layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1_ssse3_glue.c | 139 ++++++++------------------------------
 1 file changed, 28 insertions(+), 111 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 6c20fe04a738..33d1b9dc14cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -28,7 +28,7 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha1_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 #define SHA1_AVX2_BLOCK_OPTSIZE	4	/* optimal 4*64 bytes of SHA1 blocks */
 
 asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
-				unsigned int rounds);
+				    unsigned int rounds);
 #endif
 
-static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
-
-
-static int sha1_ssse3_init(struct shash_desc *desc)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	*sctx = (struct sha1_state){
-		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
-	};
-
-	return 0;
-}
-
-static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count += len;
-
-	if (partial) {
-		done = SHA1_BLOCK_SIZE - partial;
-		memcpy(sctx->buffer + partial, data, done);
-		sha1_transform_asm(sctx->state, sctx->buffer, 1);
-	}
-
-	if (len - done >= SHA1_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
-
-		sha1_transform_asm(sctx->state, data + done, rounds);
-		done += rounds * SHA1_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buffer, data + done, len - done);
-
-	return 0;
-}
+static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
 
 static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
-	int res;
 
-	/* Handle the fast case right here */
-	if (partial + len < SHA1_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buffer + partial, data, len);
+	if (!irq_fpu_usable() ||
+	    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
+		return crypto_sha1_update(desc, data, len);
 
-		return 0;
-	}
+	/* make sure casting to sha1_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
 
-	if (!irq_fpu_usable()) {
-		res = crypto_sha1_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha1_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
-
-	return res;
-}
-
-
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA1_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
-	if (!irq_fpu_usable()) {
-		crypto_sha1_update(desc, padding, padlen);
-		crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha1_ssse3_update() */
-		if (padlen <= 56) {
-			sctx->count += padlen;
-			memcpy(sctx->buffer + index, padding, padlen);
-		} else {
-			__sha1_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
+	kernel_fpu_begin();
+	sha1_base_do_update(desc, data, len,
+			    (sha1_block_fn *)sha1_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha1_ssse3_export(struct shash_desc *desc, void *out)
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
+	if (!irq_fpu_usable())
+		return crypto_sha1_finup(desc, data, len, out);
 
-	memcpy(out, sctx, sizeof(*sctx));
+	kernel_fpu_begin();
+	if (len)
+		sha1_base_do_update(desc, data, len,
+				    (sha1_block_fn *)sha1_transform_asm);
+	sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
+	kernel_fpu_end();
 
-	return 0;
+	return sha1_base_finish(desc, out);
 }
 
-static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
+	return sha1_ssse3_finup(desc, NULL, 0, out);
 }
 
 #ifdef CONFIG_AS_AVX2
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data,
 
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_ssse3_init,
+	.init		=	sha1_base_init,
 	.update		=	sha1_ssse3_update,
 	.final		=	sha1_ssse3_final,
-	.export		=	sha1_ssse3_export,
-	.import		=	sha1_ssse3_import,
+	.finup		=	sha1_ssse3_finup,
 	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-ssse3",
-- 
cgit v1.2.3


From 1631030ae63aef0a54fe08813e0f4e26c8ef9c78 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Thu, 9 Apr 2015 12:55:47 +0200
Subject: crypto: x86/sha256_ssse3 - move SHA-224/256 SSSE3 implementation to
 base layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer. It also changes the
prototypes of the core asm functions to be compatible with the base
prototype

  void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks)

so that they can be passed to the base layer directly.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-avx-asm.S    |  10 +-
 arch/x86/crypto/sha256-avx2-asm.S   |  10 +-
 arch/x86/crypto/sha256-ssse3-asm.S  |  10 +-
 arch/x86/crypto/sha256_ssse3_glue.c | 193 +++++++-----------------------------
 4 files changed, 50 insertions(+), 173 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 642f15687a0a..92b3b5d75ba9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
 BYTE_FLIP_MASK = %xmm13
 
 NUM_BLKS = %rdx   # 3rd arg
-CTX = %rsi        # 2nd arg
-INP = %rdi        # 1st arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
 
-SRND = %rdi       # clobbers INP
+SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
@@ -342,8 +342,8 @@ a = TMP_
 
 ########################################################################
 ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9e86944c539d..570ec5ec62d7 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13
 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
 
 NUM_BLKS = %rdx	# 3rd arg
-CTX	= %rsi  # 2nd arg
-INP	= %rdi	# 1st arg
+INP	= %rsi  # 2nd arg
+CTX	= %rdi	# 1st arg
 c	= %ecx
 d	= %r8d
 e       = %edx	# clobbers NUM_BLKS
-y3	= %edi	# clobbers INP
+y3	= %esi	# clobbers INP
 
 
 TBL	= %rbp
@@ -523,8 +523,8 @@ STACK_SIZE	= _RSP      + _RSP_SIZE
 
 ########################################################################
 ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index f833b74d902b..2cedc44e8121 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
 BYTE_FLIP_MASK = %xmm12
 
 NUM_BLKS = %rdx   # 3rd arg
-CTX = %rsi        # 2nd arg
-INP = %rdi        # 1st arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
 
-SRND = %rdi       # clobbers INP
+SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
@@ -348,8 +348,8 @@ a = TMP_
 
 ########################################################################
 ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 8fad72f4dfd2..ccc338881ee8 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -36,195 +36,74 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha256_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
 #include <linux/string.h>
 
-asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
-				     u64 rounds);
+asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
+				       u64 rounds);
 #ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
 				     u64 rounds);
 #endif
 #ifdef CONFIG_AS_AVX2
-asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
-				     u64 rounds);
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+				      u64 rounds);
 #endif
 
-static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
-
-
-static int sha256_ssse3_init(struct shash_desc *desc)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
-
-	return 0;
-}
-
-static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count += len;
-
-	if (partial) {
-		done = SHA256_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha256_transform_asm(sctx->buf, sctx->state, 1);
-	}
-
-	if (len - done >= SHA256_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
-
-		sha256_transform_asm(data + done, sctx->state, (u64) rounds);
-
-		done += rounds * SHA256_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
-
-	return 0;
-}
+static void (*sha256_transform_asm)(u32 *, const char *, u64);
 
 static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
-	int res;
 
-	/* Handle the fast case right here */
-	if (partial + len < SHA256_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buf + partial, data, len);
+	if (!irq_fpu_usable() ||
+	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
+		return crypto_sha256_update(desc, data, len);
 
-		return 0;
-	}
-
-	if (!irq_fpu_usable()) {
-		res = crypto_sha256_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha256_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
-
-	return res;
-}
+	/* make sure casting to sha256_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
 
-
-/* Add padding and return the message digest. */
-static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA256_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
-
-	if (!irq_fpu_usable()) {
-		crypto_sha256_update(desc, padding, padlen);
-		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha256_ssse3_update() */
-		if (padlen <= 56) {
-			sctx->count += padlen;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha256_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha256_ssse3_update(desc, (const u8 *)&bits,
-					sizeof(bits), 56);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
+	kernel_fpu_begin();
+	sha256_base_do_update(desc, data, len,
+			      (sha256_block_fn *)sha256_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha256_state *sctx = shash_desc_ctx(desc);
+	if (!irq_fpu_usable())
+		return crypto_sha256_finup(desc, data, len, out);
 
-	memcpy(out, sctx, sizeof(*sctx));
+	kernel_fpu_begin();
+	if (len)
+		sha256_base_do_update(desc, data, len,
+				      (sha256_block_fn *)sha256_transform_asm);
+	sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
+	kernel_fpu_end();
 
-	return 0;
+	return sha256_base_finish(desc, out);
 }
 
-static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha224_ssse3_init(struct shash_desc *desc)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA224_H0;
-	sctx->state[1] = SHA224_H1;
-	sctx->state[2] = SHA224_H2;
-	sctx->state[3] = SHA224_H3;
-	sctx->state[4] = SHA224_H4;
-	sctx->state[5] = SHA224_H5;
-	sctx->state[6] = SHA224_H6;
-	sctx->state[7] = SHA224_H7;
-	sctx->count = 0;
-
-	return 0;
-}
-
-static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	u8 D[SHA256_DIGEST_SIZE];
-
-	sha256_ssse3_final(desc, D);
-
-	memcpy(hash, D, SHA224_DIGEST_SIZE);
-	memzero_explicit(D, SHA256_DIGEST_SIZE);
-
-	return 0;
+	return sha256_ssse3_finup(desc, NULL, 0, out);
 }
 
 static struct shash_alg algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_ssse3_init,
+	.init		=	sha256_base_init,
 	.update		=	sha256_ssse3_update,
 	.final		=	sha256_ssse3_final,
-	.export		=	sha256_ssse3_export,
-	.import		=	sha256_ssse3_import,
+	.finup		=	sha256_ssse3_finup,
 	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha256",
 		.cra_driver_name =	"sha256-ssse3",
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { {
 	}
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_ssse3_init,
+	.init		=	sha224_base_init,
 	.update		=	sha256_ssse3_update,
-	.final		=	sha224_ssse3_final,
-	.export		=	sha256_ssse3_export,
-	.import		=	sha256_ssse3_import,
+	.final		=	sha256_ssse3_final,
+	.finup		=	sha256_ssse3_finup,
 	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha224",
 		.cra_driver_name =	"sha224-ssse3",
-- 
cgit v1.2.3


From e68410ebf62676dfb93aafff7c55b76644f37072 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Thu, 9 Apr 2015 12:55:48 +0200
Subject: crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to
 base layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer.  It also changes the
prototypes of the core asm functions to be compatible with the base
prototype

  void (sha512_block_fn)(struct sha256_state *sst, u8 const *src, int blocks)

so that they can be passed to the base layer directly.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-avx-asm.S    |   6 +-
 arch/x86/crypto/sha512-avx2-asm.S   |   6 +-
 arch/x86/crypto/sha512-ssse3-asm.S  |   6 +-
 arch/x86/crypto/sha512_ssse3_glue.c | 202 +++++++-----------------------------
 4 files changed, 44 insertions(+), 176 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 974dde9bc6cd..565274d6a641 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -54,9 +54,9 @@
 
 # Virtual Registers
 # ARG1
-msg	= %rdi
+digest	= %rdi
 # ARG2
-digest	= %rsi
+msg	= %rsi
 # ARG3
 msglen	= %rdx
 T1	= %rcx
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_avx(const void* M, void* D, u64 L)
+# void sha512_transform_avx(void* D, const void* M, u64 L)
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 # message blocks.
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 568b96105f5c..a4771dcd1fcf 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -70,9 +70,9 @@ XFER  = YTMP0
 BYTE_FLIP_MASK  = %ymm9
 
 # 1st arg
-INP         = %rdi
+CTX         = %rdi
 # 2nd arg
-CTX         = %rsi
+INP         = %rsi
 # 3rd arg
 NUM_BLKS    = %rdx
 
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_rorx(const void* M, void* D, uint64_t L)#
+# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 #   message blocks.
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index fb56855d51f5..e610e29cbc81 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -53,9 +53,9 @@
 
 # Virtual Registers
 # ARG1
-msg =		%rdi
+digest =	%rdi
 # ARG2
-digest =	%rsi
+msg =		%rsi
 # ARG3
 msglen =	%rdx
 T1 =		%rcx
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_ssse3(const void* M, void* D, u64 L)#
+# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 #   message blocks.
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b6af26832bf..d9fa4c1e063f 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,205 +34,75 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha512_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
 
 #include <linux/string.h>
 
-asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
-				     u64 rounds);
+asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
+				       u64 rounds);
 #ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
 				     u64 rounds);
 #endif
 #ifdef CONFIG_AS_AVX2
-asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
-				     u64 rounds);
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+				      u64 rounds);
 #endif
 
-static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
-
-
-static int sha512_ssse3_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA512_H0;
-	sctx->state[1] = SHA512_H1;
-	sctx->state[2] = SHA512_H2;
-	sctx->state[3] = SHA512_H3;
-	sctx->state[4] = SHA512_H4;
-	sctx->state[5] = SHA512_H5;
-	sctx->state[6] = SHA512_H6;
-	sctx->state[7] = SHA512_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
+static void (*sha512_transform_asm)(u64 *, const char *, u64);
 
-static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count[0] += len;
-	if (sctx->count[0] < len)
-		sctx->count[1]++;
 
-	if (partial) {
-		done = SHA512_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha512_transform_asm(sctx->buf, sctx->state, 1);
-	}
-
-	if (len - done >= SHA512_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+	if (!irq_fpu_usable() ||
+	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+		return crypto_sha512_update(desc, data, len);
 
-		sha512_transform_asm(data + done, sctx->state, (u64) rounds);
-
-		done += rounds * SHA512_BLOCK_SIZE;
-	}
+	/* make sure casting to sha512_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
 
-	memcpy(sctx->buf, data + done, len - done);
+	kernel_fpu_begin();
+	sha512_base_do_update(desc, data, len,
+			      (sha512_block_fn *)sha512_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
-	int res;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA512_BLOCK_SIZE) {
-		sctx->count[0] += len;
-		if (sctx->count[0] < len)
-			sctx->count[1]++;
-		memcpy(sctx->buf + partial, data, len);
-
-		return 0;
-	}
+	if (!irq_fpu_usable())
+		return crypto_sha512_finup(desc, data, len, out);
 
-	if (!irq_fpu_usable()) {
-		res = crypto_sha512_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha512_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
+	kernel_fpu_begin();
+	if (len)
+		sha512_base_do_update(desc, data, len,
+				      (sha512_block_fn *)sha512_transform_asm);
+	sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
+	kernel_fpu_end();
 
-	return res;
+	return sha512_base_finish(desc, out);
 }
 
-
 /* Add padding and return the message digest. */
 static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be64 *dst = (__be64 *)out;
-	__be64 bits[2];
-	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
-	/* save number of bits */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128 and append length */
-	index = sctx->count[0] & 0x7f;
-	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
-
-	if (!irq_fpu_usable()) {
-		crypto_sha512_update(desc, padding, padlen);
-		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha512_ssse3_update() */
-		if (padlen <= 112) {
-			sctx->count[0] += padlen;
-			if (sctx->count[0] < padlen)
-				sctx->count[1]++;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha512_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha512_ssse3_update(desc, (const u8 *)&bits,
-					sizeof(bits), 112);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_ssse3_export(struct shash_desc *desc, void *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha384_ssse3_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA384_H0;
-	sctx->state[1] = SHA384_H1;
-	sctx->state[2] = SHA384_H2;
-	sctx->state[3] = SHA384_H3;
-	sctx->state[4] = SHA384_H4;
-	sctx->state[5] = SHA384_H5;
-	sctx->state[6] = SHA384_H6;
-	sctx->state[7] = SHA384_H7;
-
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA512_DIGEST_SIZE];
-
-	sha512_ssse3_final(desc, D);
-
-	memcpy(hash, D, SHA384_DIGEST_SIZE);
-	memzero_explicit(D, SHA512_DIGEST_SIZE);
-
-	return 0;
+	return sha512_ssse3_finup(desc, NULL, 0, out);
 }
 
 static struct shash_alg algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
-	.init		=	sha512_ssse3_init,
+	.init		=	sha512_base_init,
 	.update		=	sha512_ssse3_update,
 	.final		=	sha512_ssse3_final,
-	.export		=	sha512_ssse3_export,
-	.import		=	sha512_ssse3_import,
+	.finup		=	sha512_ssse3_finup,
 	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name =	"sha512-ssse3",
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { {
 	}
 },  {
 	.digestsize	=	SHA384_DIGEST_SIZE,
-	.init		=	sha384_ssse3_init,
+	.init		=	sha384_base_init,
 	.update		=	sha512_ssse3_update,
-	.final		=	sha384_ssse3_final,
-	.export		=	sha512_ssse3_export,
-	.import		=	sha512_ssse3_import,
+	.final		=	sha512_ssse3_final,
+	.finup		=	sha512_ssse3_finup,
 	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name =	"sha384-ssse3",
-- 
cgit v1.2.3


From 066450be419fa48007a9f29e19828f2a86198754 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 12 Apr 2015 11:11:21 +0200
Subject: perf/x86/intel/pt: Clean up the control flow in pt_pmu_hw_init()

Dan Carpenter pointed out that the control flow in pt_pmu_hw_init()
is a bit messy: for example the kfree(de_attrs) is entirely
superfluous.

Another problem is the inconsistent mixing of label based and
direct return error handling.

Add modern, label based error handling instead and clean up the code
a bit as well.

Note that we'll still do a kfree(NULL) in the normal case - this does
not matter as this is an init path and kfree() returns early if it
sees a NULL.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20150409090805.GG17605@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 53 +++++++++++++++++--------------
 1 file changed, 30 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index f5a3afc65371..f2770641c0fd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -119,48 +119,55 @@ static int __init pt_pmu_hw_init(void)
 	struct dev_ext_attribute *de_attrs;
 	struct attribute **attrs;
 	size_t size;
+	int ret;
 	long i;
 
-	if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) {
-		for (i = 0; i < PT_CPUID_LEAVES; i++)
-			cpuid_count(20, i,
-				    &pt_pmu.caps[CR_EAX + i * 4],
-				    &pt_pmu.caps[CR_EBX + i * 4],
-				    &pt_pmu.caps[CR_ECX + i * 4],
-				    &pt_pmu.caps[CR_EDX + i * 4]);
-	} else {
-		return -ENODEV;
+	attrs = NULL;
+	ret = -ENODEV;
+	if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+		goto fail;
+
+	for (i = 0; i < PT_CPUID_LEAVES; i++) {
+		cpuid_count(20, i,
+			    &pt_pmu.caps[CR_EAX + i*4],
+			    &pt_pmu.caps[CR_EBX + i*4],
+			    &pt_pmu.caps[CR_ECX + i*4],
+			    &pt_pmu.caps[CR_EDX + i*4]);
 	}
 
-	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1);
+	ret = -ENOMEM;
+	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
 	attrs = kzalloc(size, GFP_KERNEL);
 	if (!attrs)
-		goto err_attrs;
+		goto fail;
 
-	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1);
+	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
 	de_attrs = kzalloc(size, GFP_KERNEL);
 	if (!de_attrs)
-		goto err_de_attrs;
+		goto fail;
 
 	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
-		de_attrs[i].attr.attr.name = pt_caps[i].name;
+		struct dev_ext_attribute *de_attr = de_attrs + i;
+
+		de_attr->attr.attr.name = pt_caps[i].name;
+
+		sysfs_attr_init(&de_attrs->attr.attr);
 
-		sysfs_attr_init(&de_attrs[i].attr.attr);
-		de_attrs[i].attr.attr.mode = S_IRUGO;
-		de_attrs[i].attr.show = pt_cap_show;
-		de_attrs[i].var = (void *)i;
-		attrs[i] = &de_attrs[i].attr.attr;
+		de_attr->attr.attr.mode		= S_IRUGO;
+		de_attr->attr.show		= pt_cap_show;
+		de_attr->var			= (void *)i;
+
+		attrs[i] = &de_attr->attr.attr;
 	}
 
 	pt_cap_group.attrs = attrs;
+
 	return 0;
 
-err_de_attrs:
-	kfree(de_attrs);
-err_attrs:
+fail:
 	kfree(attrs);
 
-	return -ENOMEM;
+	return ret;
 }
 
 #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
-- 
cgit v1.2.3


From fd223849f10a28fa40201652b5f13d52fa8f2bb0 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sun, 13 Jul 2014 17:41:57 +0200
Subject: um: Remove signal translation and exec_domain

As execution domain support is gone we can remove
signal translation from the signal code and remove
exec_domain from thread_info.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/signal.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 0c8c32bfd792..592491d1d70d 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
 	if (err)
 		return err;
 
-	/* Set up registers for signal handler */
-	{
-		struct exec_domain *ed = current_thread_info()->exec_domain;
-		if (unlikely(ed && ed->signal_invmap && sig < 32))
-			sig = ed->signal_invmap[sig];
-	}
-
 	PT_REGS_SP(regs) = (unsigned long) frame;
 	PT_REGS_DI(regs) = sig;
 	/* In case the signal handler was declared without prototypes */
-- 
cgit v1.2.3


From 3050a35fba296196cb00e87f4a96aa7d9ed17a7b Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sun, 13 Jul 2014 17:43:51 +0200
Subject: x86: Remove signal translation and exec_domain

As execution domain support is gone we can remove
signal translation from the signal code and remove
exec_domain from thread_info.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/include/asm/thread_info.h |  3 ---
 arch/x86/kernel/signal.c           | 16 +---------------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..2df52baf5228 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -19,13 +19,11 @@
  */
 #ifndef __ASSEMBLY__
 struct task_struct;
-struct exec_domain;
 #include <asm/processor.h>
 #include <linux/atomic.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
-	struct exec_domain	*exec_domain;	/* execution domain */
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
@@ -39,7 +37,6 @@ struct thread_info {
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task		= &tsk,			\
-	.exec_domain	= &default_exec_domain,	\
 	.flags		= 0,			\
 	.cpu		= 0,			\
 	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..5ddc7ec20e75 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -592,24 +592,10 @@ badframe:
 	return 0;
 }
 
-/*
- * OK, we're invoking a handler:
- */
-static int signr_convert(int sig)
-{
-#ifdef CONFIG_X86_32
-	struct thread_info *info = current_thread_info();
-
-	if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
-		return info->exec_domain->signal_invmap[sig];
-#endif /* CONFIG_X86_32 */
-	return sig;
-}
-
 static int
 setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 {
-	int usig = signr_convert(ksig->sig);
+	int usig = ksig->sig;
 	sigset_t *set = sigmask_to_save();
 	compat_sigset_t *cset = (compat_sigset_t *) set;
 
-- 
cgit v1.2.3


From d0b5e15f0c0fdd759dd3dd48dc2dc2e7199e0da0 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 18 Mar 2015 21:31:27 +0100
Subject: um: Remove SKAS3/4 support

Before we had SKAS0 UML had two modes of operation
TT (tracing thread) and SKAS3/4 (separated kernel address space).
TT was known to be insecure and got removed a long time ago.
SKAS3/4 required a few (3 or 4) patches on the host side which never went
mainline. The last host patch is 10 years old.

With SKAS0 mode (separated kernel address space using 0 host patches),
default since 2005, SKAS3/4 is obsolete and can be removed.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/ldt.c                        | 227 +++++++------------------------
 arch/x86/um/shared/sysdep/faultinfo_32.h |   3 -
 arch/x86/um/shared/sysdep/faultinfo_64.h |   3 -
 arch/x86/um/shared/sysdep/skas_ptrace.h  |  22 ---
 4 files changed, 48 insertions(+), 207 deletions(-)
 delete mode 100644 arch/x86/um/shared/sysdep/skas_ptrace.h

(limited to 'arch/x86')

diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 8e08176f0bcb..5c0b711d2433 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -8,9 +8,7 @@
 #include <linux/slab.h>
 #include <asm/unistd.h>
 #include <os.h>
-#include <proc_mm.h>
 #include <skas.h>
-#include <skas_ptrace.h>
 #include <sysdep/tls.h>
 
 extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
 		     struct user_desc *desc, void **addr, int done)
 {
 	long res;
-
-	if (proc_mm) {
-		/*
-		 * This is a special handling for the case, that the mm to
-		 * modify isn't current->active_mm.
-		 * If this is called directly by modify_ldt,
-		 *     (current->active_mm->context.skas.u == mm_idp)
-		 * will be true. So no call to __switch_mm(mm_idp) is done.
-		 * If this is called in case of init_new_ldt or PTRACE_LDT,
-		 * mm_idp won't belong to current->active_mm, but child->mm.
-		 * So we need to switch child's mm into our userspace, then
-		 * later switch back.
-		 *
-		 * Note: I'm unsure: should interrupts be disabled here?
-		 */
-		if (!current->active_mm || current->active_mm == &init_mm ||
-		    mm_idp != &current->active_mm->context.id)
-			__switch_mm(mm_idp);
-	}
-
-	if (ptrace_ldt) {
-		struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
-			.func = func,
-			.ptr = desc,
-			.bytecount = sizeof(*desc)};
-		u32 cpu;
-		int pid;
-
-		if (!proc_mm)
-			pid = mm_idp->u.pid;
-		else {
-			cpu = get_cpu();
-			pid = userspace_pid[cpu];
-		}
-
-		res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op);
-
-		if (proc_mm)
-			put_cpu();
-	}
-	else {
-		void *stub_addr;
-		res = syscall_stub_data(mm_idp, (unsigned long *)desc,
-					(sizeof(*desc) + sizeof(long) - 1) &
-					    ~(sizeof(long) - 1),
-					addr, &stub_addr);
-		if (!res) {
-			unsigned long args[] = { func,
-						 (unsigned long)stub_addr,
-						 sizeof(*desc),
-						 0, 0, 0 };
-			res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
-					       0, addr, done);
-		}
+	void *stub_addr;
+	res = syscall_stub_data(mm_idp, (unsigned long *)desc,
+				(sizeof(*desc) + sizeof(long) - 1) &
+				    ~(sizeof(long) - 1),
+				addr, &stub_addr);
+	if (!res) {
+		unsigned long args[] = { func,
+					 (unsigned long)stub_addr,
+					 sizeof(*desc),
+					 0, 0, 0 };
+		res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
+				       0, addr, done);
 	}
 
-	if (proc_mm) {
-		/*
-		 * This is the second part of special handling, that makes
-		 * PTRACE_LDT possible to implement.
-		 */
-		if (current->active_mm && current->active_mm != &init_mm &&
-		    mm_idp != &current->active_mm->context.id)
-			__switch_mm(&current->active_mm->context.id);
-	}
-
-	return res;
-}
-
-static long read_ldt_from_host(void __user * ptr, unsigned long bytecount)
-{
-	int res, n;
-	struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) {
-			.func = 0,
-			.bytecount = bytecount,
-			.ptr = kmalloc(bytecount, GFP_KERNEL)};
-	u32 cpu;
-
-	if (ptrace_ldt.ptr == NULL)
-		return -ENOMEM;
-
-	/*
-	 * This is called from sys_modify_ldt only, so userspace_pid gives
-	 * us the right number
-	 */
-
-	cpu = get_cpu();
-	res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt);
-	put_cpu();
-	if (res < 0)
-		goto out;
-
-	n = copy_to_user(ptr, ptrace_ldt.ptr, res);
-	if (n != 0)
-		res = -EFAULT;
-
-  out:
-	kfree(ptrace_ldt.ptr);
-
 	return res;
 }
 
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
 		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
 	err = bytecount;
 
-	if (ptrace_ldt)
-		return read_ldt_from_host(ptr, bytecount);
-
 	mutex_lock(&ldt->lock);
 	if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
 		size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
 			goto out;
 	}
 
-	if (!ptrace_ldt)
-		mutex_lock(&ldt->lock);
+	mutex_lock(&ldt->lock);
 
 	err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
 	if (err)
 		goto out_unlock;
-	else if (ptrace_ldt) {
-		/* With PTRACE_LDT available, this is used as a flag only */
-		ldt->entry_count = 1;
-		goto out;
-	}
 
 	if (ldt_info.entry_number >= ldt->entry_count &&
 	    ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 	int i;
 	long page, err=0;
 	void *addr = NULL;
-	struct proc_mm_op copy;
 
 
-	if (!ptrace_ldt)
-		mutex_init(&new_mm->arch.ldt.lock);
+	mutex_init(&new_mm->arch.ldt.lock);
 
 	if (!from_mm) {
 		memset(&desc, 0, sizeof(desc));
 		/*
-		 * We have to initialize a clean ldt.
+		 * Now we try to retrieve info about the ldt, we
+		 * inherited from the host. All ldt-entries found
+		 * will be reset in the following loop
 		 */
-		if (proc_mm) {
-			/*
-			 * If the new mm was created using proc_mm, host's
-			 * default-ldt currently is assigned, which normally
-			 * contains the call-gates for lcall7 and lcall27.
-			 * To remove these gates, we simply write an empty
-			 * entry as number 0 to the host.
-			 */
-			err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1);
-		}
-		else{
-			/*
-			 * Now we try to retrieve info about the ldt, we
-			 * inherited from the host. All ldt-entries found
-			 * will be reset in the following loop
-			 */
-			ldt_get_host_info();
-			for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
-				desc.entry_number = *num_p;
-				err = write_ldt_entry(&new_mm->id, 1, &desc,
-						      &addr, *(num_p + 1) == -1);
-				if (err)
-					break;
-			}
+		ldt_get_host_info();
+		for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
+			desc.entry_number = *num_p;
+			err = write_ldt_entry(&new_mm->id, 1, &desc,
+					      &addr, *(num_p + 1) == -1);
+			if (err)
+				break;
 		}
 		new_mm->arch.ldt.entry_count = 0;
 
 		goto out;
 	}
 
-	if (proc_mm) {
-		/*
-		 * We have a valid from_mm, so we now have to copy the LDT of
-		 * from_mm to new_mm, because using proc_mm an new mm with
-		 * an empty/default LDT was created in new_mm()
-		 */
-		copy = ((struct proc_mm_op) { .op 	= MM_COPY_SEGMENTS,
-					      .u 	=
-					      { .copy_segments =
-							from_mm->id.u.mm_fd } } );
-		i = os_write_file(new_mm->id.u.mm_fd, &copy, sizeof(copy));
-		if (i != sizeof(copy))
-			printk(KERN_ERR "new_mm : /proc/mm copy_segments "
-			       "failed, err = %d\n", -i);
-	}
-
-	if (!ptrace_ldt) {
-		/*
-		 * Our local LDT is used to supply the data for
-		 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
-		 * i.e., we have to use the stub for modify_ldt, which
-		 * can't handle the big read buffer of up to 64kB.
-		 */
-		mutex_lock(&from_mm->arch.ldt.lock);
-		if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
-			memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
-			       sizeof(new_mm->arch.ldt.u.entries));
-		else {
-			i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
-			while (i-->0) {
-				page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-				if (!page) {
-					err = -ENOMEM;
-					break;
-				}
-				new_mm->arch.ldt.u.pages[i] =
-					(struct ldt_entry *) page;
-				memcpy(new_mm->arch.ldt.u.pages[i],
-				       from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
+	/*
+	 * Our local LDT is used to supply the data for
+	 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
+	 * i.e., we have to use the stub for modify_ldt, which
+	 * can't handle the big read buffer of up to 64kB.
+	 */
+	mutex_lock(&from_mm->arch.ldt.lock);
+	if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
+		memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
+		       sizeof(new_mm->arch.ldt.u.entries));
+	else {
+		i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+		while (i-->0) {
+			page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+			if (!page) {
+				err = -ENOMEM;
+				break;
 			}
+			new_mm->arch.ldt.u.pages[i] =
+				(struct ldt_entry *) page;
+			memcpy(new_mm->arch.ldt.u.pages[i],
+			       from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
 		}
-		new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
-		mutex_unlock(&from_mm->arch.ldt.lock);
 	}
+	new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
+	mutex_unlock(&from_mm->arch.ldt.lock);
 
     out:
 	return err;
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm)
 {
 	int i;
 
-	if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
+	if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
 		i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
 		while (i-- > 0)
 			free_page((long) mm->arch.ldt.u.pages[i]);
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index a26086b8a800..b6f2437ec29c 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -27,9 +27,6 @@ struct faultinfo {
 /* This is Page Fault */
 #define SEGV_IS_FIXABLE(fi)	((fi)->trap_no == 14)
 
-/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
-#define SEGV_MAYBE_FIXABLE(fi)	((fi)->trap_no == 0 && ptrace_faultinfo)
-
 #define PTRACE_FULL_FAULTINFO 0
 
 #endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index f811cbe15d62..ee88f88974ea 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -27,9 +27,6 @@ struct faultinfo {
 /* This is Page Fault */
 #define SEGV_IS_FIXABLE(fi)	((fi)->trap_no == 14)
 
-/* No broken SKAS API, which doesn't pass trap_no, here. */
-#define SEGV_MAYBE_FIXABLE(fi)	0
-
 #define PTRACE_FULL_FAULTINFO 1
 
 #endif
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h
deleted file mode 100644
index 453febe98993..000000000000
--- a/arch/x86/um/shared/sysdep/skas_ptrace.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_SKAS_PTRACE_H
-#define __SYSDEP_X86_SKAS_PTRACE_H
-
-struct ptrace_faultinfo {
-        int is_write;
-        unsigned long addr;
-};
-
-struct ptrace_ldt {
-        int func;
-        void *ptr;
-        unsigned long bytecount;
-};
-
-#define PTRACE_LDT 54
-
-#endif
-- 
cgit v1.2.3


From 28fa468f53163bc0b867b4cc75a9e36e7ed4dbbd Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 18 Mar 2015 21:42:54 +0100
Subject: um: Remove broken SMP support

At times where UML used the TT mode to operate it had
kind of SMP support. It never got finished nor was
stable.
Let's rip out that cruft and stop confusing developers
which do tree-wide SMP cleanups.

If someone wants SMP support UML it has do be done from scratch.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/asm/barrier.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..aec75a5d8722 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,22 +36,11 @@
 #endif /* CONFIG_X86_PPRO_FENCE */
 #define dma_wmb()	barrier()
 
-#ifdef CONFIG_SMP
-
-#define smp_mb()	mb()
-#define smp_rmb()	dma_rmb()
-#define smp_wmb()	barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-
-#else /* CONFIG_SMP */
-
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
 
-#endif /* CONFIG_SMP */
-
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-- 
cgit v1.2.3


From a98a6d864d3b84219a6ec6213b00c260fb52f9f4 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 18 Mar 2015 21:59:35 +0100
Subject: um: Remove broken highmem support

Highmem was always buggy and experimental on UML(i386).
In times where 64 bit computers are default we can
remove that experimental code.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index eafa324eb7a5..acb384d24669 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
 subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
-subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
 
 else
 
-- 
cgit v1.2.3


From fc9bea0e28db8cbfe0a08c1bfb1796bfd7adf49b Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Tue, 24 Mar 2015 18:31:24 +0300
Subject: x86, UML: fix integer overflow in ELF_ET_DYN_BASE

Almost all arches define ELF_ET_DYN_BASE as 2/3 of TASK_SIZE.
Though it seems that some architectures do this in a wrong way.
The problem is that 2*TASK_SIZE may overflow 32-bits so
the real ELF_ET_DYN_BASE becomes wrong.
Fix this overflow by dividing TASK_SIZE prior to multiplying:
	(TASK_SIZE / 3 * 2)

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/asm/elf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 25a1022dd793..0a656b727b1a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 
 #define ELF_EXEC_PAGESIZE 4096
 
-#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3)
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
 
 extern long elf_aux_hwcap;
 #define ELF_HWCAP (elf_aux_hwcap)
-- 
cgit v1.2.3


From c4d30668da689de2f27bb0b19de4430d6c95d7cf Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 10 Apr 2015 00:22:56 -0400
Subject: x86 msr-index: define MSR_TURBO_RATIO_LIMIT,1,2

MSR_TURBO_RATIO_LIMIT has grown into a set of three registers.
Add the documented names for them, in preparation
for deleting the previous ad-hoc names:

+#define MSR_TURBO_RATIO_LIMIT          0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1         0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2         0x000001af

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: x86@kernel.org
---
 arch/x86/include/uapi/asm/msr-index.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..c4c75272314a 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -61,6 +61,9 @@
 #define MSR_OFFCORE_RSP_1		0x000001a7
 #define MSR_NHM_TURBO_RATIO_LIMIT	0x000001ad
 #define MSR_IVT_TURBO_RATIO_LIMIT	0x000001ae
+#define MSR_TURBO_RATIO_LIMIT		0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1		0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2		0x000001af
 
 #define MSR_LBR_SELECT			0x000001c8
 #define MSR_LBR_TOS			0x000001c9
-- 
cgit v1.2.3


From 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@cs.technion.ac.il>
Date: Sun, 12 Apr 2015 21:47:15 +0300
Subject: KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save

kvm_init_msr_list is currently called before hardware_setup. As a result,
vmx_mpx_supported always returns false when kvm_init_msr_list checks whether to
save MSR_IA32_BNDCFGS.

Move kvm_init_msr_list after vmx_hardware_setup is called to fix this issue.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>

Message-Id: <1428864435-4732-1-git-send-email-namit@cs.technion.ac.il>
Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1a81267f3f6..ed31c31b2485 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque)
 	kvm_set_mmio_spte_mask();
 
 	kvm_x86_ops = ops;
-	kvm_init_msr_list();
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void)
 
 int kvm_arch_hardware_setup(void)
 {
-	return kvm_x86_ops->hardware_setup();
+	int r;
+
+	r = kvm_x86_ops->hardware_setup();
+	if (r != 0)
+		return r;
+
+	kvm_init_msr_list();
+	return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)
-- 
cgit v1.2.3


From bea15428b9d6bc36e87288f168ab314619a66757 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 13 Apr 2015 15:40:02 +0200
Subject: KVM: x86: cleanup kvm_irq_delivery_to_apic_fast

Sparse is reporting a "we previously assumed 'src' could be null" error.
This is true as far as the static analyzer can see, but in practice only
IPIs can set shorthand to self and they also set 'src', so it's ok.
Still, move the initialization of x2apic_ipi (and thus the NULL check for
src right before the first use.

While at it, initializing ret to "false" is somewhat confusing because of
the almost immediate assigned of "true" to the same variable.  Thus,
initialize it to "true" and modify it in the only path that used to use
the value from "bool ret = false".  There is no change in generated code
from this change.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d67206a7b99a..629af0f1c5c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	unsigned long bitmap = 1;
 	struct kvm_lapic **dst;
 	int i;
-	bool ret = false;
-	bool x2apic_ipi = src && apic_x2apic_mode(src);
+	bool ret, x2apic_ipi;
 
 	*r = -1;
 
@@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	if (irq->shorthand)
 		return false;
 
+	x2apic_ipi = src && apic_x2apic_mode(src);
 	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
 		return false;
 
+	ret = true;
 	rcu_read_lock();
 	map = rcu_dereference(kvm->arch.apic_map);
 
-	if (!map)
+	if (!map) {
+		ret = false;
 		goto out;
-
-	ret = true;
+	}
 
 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
 		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-- 
cgit v1.2.3


From 692297d8f96887f836d9049a653ed05a71cf48fb Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Tue, 14 Apr 2015 15:44:19 -0700
Subject: watchdog: introduce the hardlockup_detector_disable() function

Have kvm_guest_init() use hardlockup_detector_disable() instead of
watchdog_enable_hardlockup_detector(false).

Remove the watchdog_hardlockup_detector_is_enabled() and the
watchdog_enable_hardlockup_detector() function which are no longer needed.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/kvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
 	 * can get false positives too easily, for example if the host is
 	 * overcommitted.
 	 */
-	watchdog_enable_hardlockup_detector(false);
+	hardlockup_detector_disable();
 }
 
 static noinline uint32_t __kvm_cpuid_base(void)
-- 
cgit v1.2.3


From 982333683385343d8d2db9a1df69c98406f42687 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 14 Apr 2015 15:46:14 -0700
Subject: x86: expose number of page table levels on Kconfig level

We would want to use number of page table level to define mm_struct.
Let's expose it as CONFIG_PGTABLE_LEVELS.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                            |  6 ++++++
 arch/x86/include/asm/paravirt.h             |  8 ++++----
 arch/x86/include/asm/paravirt_types.h       |  8 ++++----
 arch/x86/include/asm/pgalloc.h              |  8 ++++----
 arch/x86/include/asm/pgtable-2level_types.h |  1 -
 arch/x86/include/asm/pgtable-3level_types.h |  2 --
 arch/x86/include/asm/pgtable.h              |  8 ++++----
 arch/x86/include/asm/pgtable_64_types.h     |  1 -
 arch/x86/include/asm/pgtable_types.h        |  4 ++--
 arch/x86/kernel/paravirt.c                  |  6 +++---
 arch/x86/mm/pgtable.c                       | 14 +++++++-------
 arch/x86/xen/mmu.c                          | 14 +++++++-------
 12 files changed, 41 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..3e3aaad23414 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,6 +277,12 @@ config ARCH_SUPPORTS_UPROBES
 config FIX_EARLYCON_MEM
 	def_bool y
 
+config PGTABLE_LEVELS
+	int
+	default 4 if X86_64
+	default 3 if X86_PAE
+	default 2
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5f6051d5d139..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 		PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
 }
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 static inline pmd_t __pmd(pmdval_t val)
 {
 	pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
 		PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
 			    val);
 }
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 static inline pud_t __pud(pudval_t val)
 {
 	pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
 	set_pud(pudp, __pud(0));
 }
 
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
-#endif	/* PAGETABLE_LEVELS >= 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS >= 3 */
 
 #ifdef CONFIG_X86_PAE
 /* Special-case pte-setting operations for PAE, which can't update a
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
 	struct paravirt_callee_save pgd_val;
 	struct paravirt_callee_save make_pgd;
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
 	void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
 	struct paravirt_callee_save pmd_val;
 	struct paravirt_callee_save make_pmd;
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	struct paravirt_callee_save pud_val;
 	struct paravirt_callee_save make_pud;
 
 	void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif	/* PAGETABLE_LEVELS == 4 */
-#endif	/* PAGETABLE_LEVELS >= 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS >= 3 */
 
 	struct pv_lazy_ops lazy_mode;
 
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 	___pud_free_tlb(tlb, pud);
 }
 
-#endif	/* PAGETABLE_LEVELS > 3 */
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
 #endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
 #endif	/* !__ASSEMBLY__ */
 
 #define SHARED_KERNEL_PMD	0
-#define PAGETABLE_LEVELS	2
 
 /*
  * traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
 #define SHARED_KERNEL_PMD	1
 #endif
 
-#define PAGETABLE_LEVELS	3
-
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
 	return npg >> (20 - PAGE_SHIFT);
 }
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline int pud_none(pud_t pud)
 {
 	return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
 {
 	return 0;
 }
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline int pgd_present(pgd_t pgd)
 {
 	return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
 {
 	return !native_pgd_val(pgd);
 }
-#endif	/* PAGETABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 
 #endif	/* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
 #endif	/* !__ASSEMBLY__ */
 
 #define SHARED_KERNEL_PMD	0
-#define PAGETABLE_LEVELS	4
 
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
 	return native_pgd_val(pgd) & PTE_FLAGS_MASK;
 }
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 typedef struct { pudval_t pud; } pud_t;
 
 static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
 }
 #endif
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 typedef struct { pmdval_t pmd; } pmd_t;
 
 static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.ptep_modify_prot_start = __ptep_modify_prot_start,
 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
 	.pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pmd_val = PTE_IDENT,
 	.make_pmd = PTE_IDENT,
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	.pud_val = PTE_IDENT,
 	.make_pud = PTE_IDENT,
 
 	.set_pgd = native_set_pgd,
 #endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
 
 	.pte_val = PTE_IDENT,
 	.pgd_val = PTE_IDENT,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5a7e5252c878..b28edfecbdfe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -58,7 +58,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 	tlb_remove_page(tlb, pte);
 }
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = virt_to_page(pmd);
@@ -74,14 +74,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 	tlb_remove_page(tlb, page);
 }
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pud));
 }
-#endif	/* PAGETABLE_LEVELS > 3 */
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
 static inline void pgd_list_add(pgd_t *pgd)
 {
@@ -117,9 +117,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
 	   references from swapper_pg_dir. */
-	if (PAGETABLE_LEVELS == 2 ||
-	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-	    PAGETABLE_LEVELS == 4) {
+	if (CONFIG_PGTABLE_LEVELS == 2 ||
+	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+	    CONFIG_PGTABLE_LEVELS == 4) {
 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 				KERNEL_PGD_PTRS);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..65083ad63b6f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 __visible pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
 /*
  * (Yet another) pagetable walker.  This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
 	xen_release_ptpage(pfn, PT_PMD);
 }
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	pv_mmu_ops.set_pgd = xen_set_pgd;
 #endif
 
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
 	pv_mmu_ops.release_pte = xen_release_pte;
 	pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
 	pv_mmu_ops.release_pud = xen_release_pud;
 #endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_pgd = xen_set_pgd_hyper,
 
 	.alloc_pud = xen_alloc_pmd_init,
 	.release_pud = xen_release_pmd_init,
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
 	.activate_mm = xen_activate_mm,
 	.dup_mmap = xen_dup_mmap,
-- 
cgit v1.2.3


From 5d72b4fba40ef4b3f7a1a11d6aacc85d9af81561 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Tue, 14 Apr 2015 15:47:29 -0700
Subject: x86, mm: support huge I/O mapping capability I/F

Implement huge I/O mapping capability interfaces for ioremap() on x86.

IOREMAP_MAX_ORDER is defined to PUD_SHIFT on x86/64 and PMD_SHIFT on
x86/32, which overrides the default value defined in <linux/vmalloc.h>.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Robert Elliott <Elliott@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/page_types.h |  2 ++
 arch/x86/mm/ioremap.c             | 23 +++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
 
 #ifdef CONFIG_X86_64
 #include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER       (PUD_SHIFT)
 #else
 #include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER       (PMD_SHIFT)
 #endif	/* CONFIG_X86_64 */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
 
 /*
  * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
  *
  * NOTE! We need to allow non-page-aligned mappings too: we will obviously
  * have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+	return cpu_has_gbpages;
+#else
+	return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+	return cpu_has_pse;
+}
+
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
  * access
-- 
cgit v1.2.3


From 6b6378355b925050eb6fa966742d8c2d65ff0d83 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Tue, 14 Apr 2015 15:47:32 -0700
Subject: x86, mm: support huge KVA mappings on x86

Implement huge KVA mapping interfaces on x86.

On x86, MTRRs can override PAT memory types with a 4KB granularity.  When
using a huge page, MTRRs can override the memory type of the huge page,
which may lead a performance penalty.  The processor can also behave in an
undefined manner if a huge page is mapped to a memory range that MTRRs
have mapped with multiple different memory types.  Therefore, the mapping
code falls back to use a smaller page size toward 4KB when a mapping range
is covered by non-WB type of MTRRs.  The WB type of MTRRs has no affect on
the PAT memory types.

pud_set_huge() and pmd_set_huge() call mtrr_type_lookup() to see if a
given range is covered by MTRRs.  MTRR_TYPE_WRBACK indicates that the
range is either covered by WB or not covered and the MTRR default value is
set to WB.  0xFF indicates that MTRRs are disabled.

HAVE_ARCH_HUGE_VMAP is selected when X86_64 or X86_32 with X86_PAE is set.
 X86_32 without X86_PAE is not supported since such config can unlikey be
benefited from this feature, and there was an issue found in testing.

[fengguang.wu@intel.com: ioremap_pud_capable can be static]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Robert Elliott <Elliott@hp.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig      |  1 +
 arch/x86/mm/pgtable.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3e3aaad23414..0f948cefaeb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -99,6 +99,7 @@ config X86
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
 	select ARCH_HAS_SG_CHAIN
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b28edfecbdfe..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
+#include <asm/mtrr.h>
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
@@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 {
 	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
 }
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pud, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pmd, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+	if (pud_large(*pud)) {
+		pud_clear(pud);
+		return 1;
+	}
+
+	return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	if (pmd_large(*pmd)) {
+		pmd_clear(pmd);
+		return 1;
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
-- 
cgit v1.2.3


From 82168140bc4cec7ec9bad39705518541149ff8b7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 14 Apr 2015 15:47:45 -0700
Subject: x86: standardize mmap_rnd() usage

In preparation for splitting out ET_DYN ASLR, this refactors the use of
mmap_rnd() to be used similarly to arm, and extracts the checking of
PF_RANDOMIZE.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/mmap.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..ebfa52030d5c 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -67,22 +67,21 @@ static int mmap_is_legacy(void)
 
 static unsigned long mmap_rnd(void)
 {
-	unsigned long rnd = 0;
+	unsigned long rnd;
 
 	/*
-	*  8 bits of randomness in 32bit mmaps, 20 address space bits
-	* 28 bits of randomness in 64bit mmaps, 40 address space bits
-	*/
-	if (current->flags & PF_RANDOMIZE) {
-		if (mmap_is_ia32())
-			rnd = get_random_int() % (1<<8);
-		else
-			rnd = get_random_int() % (1<<28);
-	}
+	 *  8 bits of randomness in 32bit mmaps, 20 address space bits
+	 * 28 bits of randomness in 64bit mmaps, 40 address space bits
+	 */
+	if (mmap_is_ia32())
+		rnd = (unsigned long)get_random_int() % (1<<8);
+	else
+		rnd = (unsigned long)get_random_int() % (1<<28);
+
 	return rnd << PAGE_SHIFT;
 }
 
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
 {
 	unsigned long gap = rlimit(RLIMIT_STACK);
 
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
 }
 
 /*
  * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
  * does, but not when emulating X86_32
  */
-static unsigned long mmap_legacy_base(void)
+static unsigned long mmap_legacy_base(unsigned long rnd)
 {
 	if (mmap_is_ia32())
 		return TASK_UNMAPPED_BASE;
 	else
-		return TASK_UNMAPPED_BASE + mmap_rnd();
+		return TASK_UNMAPPED_BASE + rnd;
 }
 
 /*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	mm->mmap_legacy_base = mmap_legacy_base();
-	mm->mmap_base = mmap_base();
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE)
+		random_factor = mmap_rnd();
+
+	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
 
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mm->mmap_legacy_base;
 		mm->get_unmapped_area = arch_get_unmapped_area;
 	} else {
+		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 	}
 }
-- 
cgit v1.2.3


From 2b68f6caeac271620cd2f9362aeaed360e317df0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 14 Apr 2015 15:48:00 -0700
Subject: mm: expose arch_mmap_rnd when available

When an architecture fully supports randomizing the ELF load location,
a per-arch mmap_rnd() function is used to find a randomized mmap base.
In preparation for randomizing the location of ET_DYN binaries
separately from mmap, this renames and exports these functions as
arch_mmap_rnd(). Additionally introduces CONFIG_ARCH_HAS_ELF_RANDOMIZE
for describing this feature on architectures that support it
(which is a superset of ARCH_BINFMT_ELF_RANDOMIZE_PIE, since s390
already supports a separated ET_DYN ASLR from mmap ASLR without the
ARCH_BINFMT_ELF_RANDOMIZE_PIE logic).

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig   | 1 +
 arch/x86/mm/mmap.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0f948cefaeb1..782ddbbc1c9a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select SPARSE_IRQ
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index ebfa52030d5c..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,7 +65,7 @@ static int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -114,7 +114,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
 
-- 
cgit v1.2.3


From d1fd836dcf00d2028c700c7e44d2c23404062c90 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 14 Apr 2015 15:48:07 -0700
Subject: mm: split ET_DYN ASLR from mmap ASLR

This fixes the "offset2lib" weakness in ASLR for arm, arm64, mips,
powerpc, and x86.  The problem is that if there is a leak of ASLR from
the executable (ET_DYN), it means a leak of shared library offset as
well (mmap), and vice versa.  Further details and a PoC of this attack
is available here:

  http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html

With this patch, a PIE linked executable (ET_DYN) has its own ASLR
region:

  $ ./show_mmaps_pie
  54859ccd6000-54859ccd7000 r-xp  ...  /tmp/show_mmaps_pie
  54859ced6000-54859ced7000 r--p  ...  /tmp/show_mmaps_pie
  54859ced7000-54859ced8000 rw-p  ...  /tmp/show_mmaps_pie
  7f75be764000-7f75be91f000 r-xp  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75be91f000-7f75beb1f000 ---p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb1f000-7f75beb23000 r--p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb23000-7f75beb25000 rw-p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb25000-7f75beb2a000 rw-p  ...
  7f75beb2a000-7f75beb4d000 r-xp  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed45000-7f75bed46000 rw-p  ...
  7f75bed46000-7f75bed47000 r-xp  ...
  7f75bed47000-7f75bed4c000 rw-p  ...
  7f75bed4c000-7f75bed4d000 r--p  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed4d000-7f75bed4e000 rw-p  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed4e000-7f75bed4f000 rw-p  ...
  7fffb3741000-7fffb3762000 rw-p  ...  [stack]
  7fffb377b000-7fffb377d000 r--p  ...  [vvar]
  7fffb377d000-7fffb377f000 r-xp  ...  [vdso]

The change is to add a call the newly created arch_mmap_rnd() into the
ELF loader for handling ET_DYN ASLR in a separate region from mmap ASLR,
as was already done on s390.  Removes CONFIG_BINFMT_ELF_RANDOMIZE_PIE,
which is no longer needed.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 782ddbbc1c9a..1f7f185934a5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,6 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-- 
cgit v1.2.3


From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 14 Apr 2015 15:48:12 -0700
Subject: mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE

The arch_randomize_brk() function is used on several architectures,
even those that don't support ET_DYN ASLR. To avoid bulky extern/#define
tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for
the architectures that support it, while still handling CONFIG_COMPAT_BRK.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/elf.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 935588d95c82..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 					      int uses_interp);
 #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
 
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 /*
  * True on X86_32 or when emulating IA32 on X86_64
  */
-- 
cgit v1.2.3


From 4a20799d11f64e6b8725cacc7619b1ae1dbf9acd Mon Sep 17 00:00:00 2001
From: Vladimir Murzin <vladimir.murzin@arm.com>
Date: Tue, 14 Apr 2015 15:48:27 -0700
Subject: mm: move memtest under mm

Memtest is a simple feature which fills the memory with a given set of
patterns and validates memory contents, if bad memory regions is detected
it reserves them via memblock API.  Since memblock API is widely used by
other architectures this feature can be enabled outside of x86 world.

This patch set promotes memtest to live under generic mm umbrella and
enables memtest feature for arm/arm64.

It was reported that this patch set was useful for tracking down an issue
with some errant DMA on an arm64 platform.

This patch (of 6):

There is nothing platform dependent in the core memtest code, so other
platforms might benefit from this feature too.

[linux@roeck-us.net: MEMTEST depends on MEMBLOCK]
Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig            |  11 -----
 arch/x86/include/asm/e820.h |   8 ---
 arch/x86/mm/Makefile        |   2 -
 arch/x86/mm/memtest.c       | 118 --------------------------------------------
 4 files changed, 139 deletions(-)
 delete mode 100644 arch/x86/mm/memtest.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1f7f185934a5..d43e7e1c784b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -721,17 +721,6 @@ endif #HYPERVISOR_GUEST
 config NO_BOOTMEM
 	def_bool y
 
-config MEMTEST
-	bool "Memtest"
-	---help---
-	  This option adds a kernel parameter 'memtest', which allows memtest
-	  to be set.
-	        memtest=0, mean disabled; -- default
-	        memtest=1, mean do 1 test pattern;
-	        ...
-	        memtest=4, mean do 4 test patterns.
-	  If you are unsure how to answer this question, answer N.
-
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
 extern unsigned long e820_end_of_ram_pfn(void);
 extern unsigned long e820_end_of_low_ram_pfn(void);
 extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_MEMTEST)		+= memtest.o
-
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/memblock.h>
-
-static u64 patterns[] __initdata = {
-	/* The first entry has to be 0 to leave memtest with zeroed memory */
-	0,
-	0xffffffffffffffffULL,
-	0x5555555555555555ULL,
-	0xaaaaaaaaaaaaaaaaULL,
-	0x1111111111111111ULL,
-	0x2222222222222222ULL,
-	0x4444444444444444ULL,
-	0x8888888888888888ULL,
-	0x3333333333333333ULL,
-	0x6666666666666666ULL,
-	0x9999999999999999ULL,
-	0xccccccccccccccccULL,
-	0x7777777777777777ULL,
-	0xbbbbbbbbbbbbbbbbULL,
-	0xddddddddddddddddULL,
-	0xeeeeeeeeeeeeeeeeULL,
-	0x7a6c7258554e494cULL, /* yeah ;-) */
-};
-
-static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
-{
-	printk(KERN_INFO "  %016llx bad mem addr %010llx - %010llx reserved\n",
-	       (unsigned long long) pattern,
-	       (unsigned long long) start_bad,
-	       (unsigned long long) end_bad);
-	memblock_reserve(start_bad, end_bad - start_bad);
-}
-
-static void __init memtest(u64 pattern, u64 start_phys, u64 size)
-{
-	u64 *p, *start, *end;
-	u64 start_bad, last_bad;
-	u64 start_phys_aligned;
-	const size_t incr = sizeof(pattern);
-
-	start_phys_aligned = ALIGN(start_phys, incr);
-	start = __va(start_phys_aligned);
-	end = start + (size - (start_phys_aligned - start_phys)) / incr;
-	start_bad = 0;
-	last_bad = 0;
-
-	for (p = start; p < end; p++)
-		*p = pattern;
-
-	for (p = start; p < end; p++, start_phys_aligned += incr) {
-		if (*p == pattern)
-			continue;
-		if (start_phys_aligned == last_bad + incr) {
-			last_bad += incr;
-			continue;
-		}
-		if (start_bad)
-			reserve_bad_mem(pattern, start_bad, last_bad + incr);
-		start_bad = last_bad = start_phys_aligned;
-	}
-	if (start_bad)
-		reserve_bad_mem(pattern, start_bad, last_bad + incr);
-}
-
-static void __init do_one_pass(u64 pattern, u64 start, u64 end)
-{
-	u64 i;
-	phys_addr_t this_start, this_end;
-
-	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
-		this_start = clamp_t(phys_addr_t, this_start, start, end);
-		this_end = clamp_t(phys_addr_t, this_end, start, end);
-		if (this_start < this_end) {
-			printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
-			       (unsigned long long)this_start,
-			       (unsigned long long)this_end,
-			       (unsigned long long)cpu_to_be64(pattern));
-			memtest(pattern, this_start, this_end - this_start);
-		}
-	}
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
-	if (arg)
-		memtest_pattern = simple_strtoul(arg, NULL, 0);
-	else
-		memtest_pattern = ARRAY_SIZE(patterns);
-
-	return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-void __init early_memtest(unsigned long start, unsigned long end)
-{
-	unsigned int i;
-	unsigned int idx = 0;
-
-	if (!memtest_pattern)
-		return;
-
-	printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
-	for (i = memtest_pattern-1; i < UINT_MAX; --i) {
-		idx = i % ARRAY_SIZE(patterns);
-		do_one_pass(patterns[idx], start, end);
-	}
-}
-- 
cgit v1.2.3


From 130005231c9f2090b1b177e2cca9841b562c1784 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@linux.intel.com>
Date: Wed, 15 Apr 2015 10:24:54 +0800
Subject: kvm: mmu: don't do memslot overflow check

As Andres pointed out:

| I don't understand the value of this check here. Are we looking for a
| broken memslot? Shouldn't this be a BUG_ON? Is this the place to care
| about these things? npages is capped to KVM_MEM_MAX_NR_PAGES, i.e.
| 2^31. A 64 bit overflow would be caused by a gigantic gfn_start which
| would be trouble in many other ways.

This patch drops the memslot overflow check to make the codes more simple.

Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Signed-off-by: Wanpeng Li <wanpeng.li@linux.intel.com>
Message-Id: <1429064694-3072-1-git-send-email-wanpeng.li@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 146f295ee322..07bb22157338 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4504,19 +4504,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	bool flush = false;
 	unsigned long *rmapp;
 	unsigned long last_index, index;
-	gfn_t gfn_start, gfn_end;
 
 	spin_lock(&kvm->mmu_lock);
 
-	gfn_start = memslot->base_gfn;
-	gfn_end = memslot->base_gfn + memslot->npages - 1;
-
-	if (gfn_start >= gfn_end)
-		goto out;
-
 	rmapp = memslot->arch.rmap[0];
-	last_index = gfn_to_index(gfn_end, memslot->base_gfn,
-					PT_PAGE_TABLE_LEVEL);
+	last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
+				memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
 
 	for (index = 0; index <= last_index; ++index, ++rmapp) {
 		if (*rmapp)
@@ -4534,7 +4527,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 
-out:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-- 
cgit v1.2.3


From decf63336e356423300b935afbebeca1fcb15184 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Date: Tue, 14 Apr 2015 12:04:10 +0800
Subject: KVM: MMU: fix comment in kvm_mmu_zap_collapsible_spte

Soft mmu uses direct shadow page to fill guest large mapping with small
pages if huge mapping is disallowed on host. So zapping direct shadow
page works well both for soft mmu and hard mmu, it's just less widely
applicable.

Fix the comment to reflect this.

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Message-Id: <552C91BA.1010703@linux.intel.com>
[Fix comment wording further. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07bb22157338..d43867c33bc4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		pfn = spte_to_pfn(*sptep);
 
 		/*
-		 * Only EPT supported for now; otherwise, one would need to
-		 * find out efficiently whether the guest page tables are
-		 * also using huge pages.
+		 * We cannot do huge page mapping for indirect shadow pages,
+		 * which are found on the last rmap (level = 1) when not using
+		 * tdp; such shadow pages are synced with the page table in
+		 * the guest, and the guest page table is using 4K page size
+		 * mapping if the indirect sp has level = 1.
 		 */
 		if (sp->role.direct &&
 			!kvm_is_reserved_pfn(pfn) &&
-- 
cgit v1.2.3


From bb668734c4c960c8f61f017585b323b97e5f47b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 17 Mar 2015 22:26:21 +0000
Subject: VFS: assorted d_backing_inode() annotations

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/kvm/assigned-dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index 6eb5c20ee373..d090ecf08809 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -666,7 +666,7 @@ static int probe_sysfs_permissions(struct pci_dev *dev)
 		if (r)
 			return r;
 
-		inode = path.dentry->d_inode;
+		inode = d_backing_inode(path.dentry);
 
 		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
 		path_put(&path);
-- 
cgit v1.2.3


From 3ac62bc0602794dc36aad77a7ef5772e989b2e22 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 15 Apr 2015 16:17:45 -0700
Subject: x86: mtrr: if: remove use of seq_printf return value

The seq_printf return value, because it's frequently misused,
will eventually be converted to void.

See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to
     seq_has_overflowed() and make public")

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mtrr/if.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
 static int mtrr_seq_show(struct seq_file *seq, void *offset)
 {
 	char factor;
-	int i, max, len;
+	int i, max;
 	mtrr_type type;
 	unsigned long base, size;
 
-	len = 0;
 	max = num_var_ranges;
 	for (i = 0; i < max; i++) {
 		mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
 			size >>= 20 - PAGE_SHIFT;
 		}
 		/* Base can be > 32bit */
-		len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
-			"(%5luMB), size=%5lu%cB, count=%d: %s\n",
-			i, base, base >> (20 - PAGE_SHIFT), size,
-			factor, mtrr_usage_table[i],
-			mtrr_attrib_to_str(type));
+		seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+			   i, base, base >> (20 - PAGE_SHIFT),
+			   size, factor,
+			   mtrr_usage_table[i], mtrr_attrib_to_str(type));
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From fd0f86b66425bd8c6af8985881e82b28c30fd450 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 16 Apr 2015 00:40:25 -0700
Subject: x86/ptrace: Fix the TIF_FORCED_TF logic in handle_signal()

When the TIF_SINGLESTEP tracee dequeues a signal,
handle_signal() clears TIF_FORCED_TF and X86_EFLAGS_TF but
leaves TIF_SINGLESTEP set.

If the tracer does PTRACE_SINGLESTEP again, enable_single_step()
sets X86_EFLAGS_TF but not TIF_FORCED_TF.  This means that the
subsequent PTRACE_CONT doesn't not clear X86_EFLAGS_TF, and the
tracee gets the wrong SIGTRAP.

Test-case (needs -O2 to avoid prologue insns in signal handler):

	#include <unistd.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>
	#include <sys/user.h>
	#include <assert.h>
	#include <stddef.h>

	void handler(int n)
	{
		asm("nop");
	}

	int child(void)
	{
		assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
		signal(SIGALRM, handler);
		kill(getpid(), SIGALRM);
		return 0x23;
	}

	void *getip(int pid)
	{
		return (void*)ptrace(PTRACE_PEEKUSER, pid,
					offsetof(struct user, regs.rip), 0);
	}

	int main(void)
	{
		int pid, status;

		pid = fork();
		if (!pid)
			return child();

		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGALRM);

		assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert((getip(pid) - (void*)handler) == 0);

		assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert((getip(pid) - (void*)handler) == 1);

		assert(ptrace(PTRACE_CONT, pid, 0,0) == 0);
		assert(wait(&status) == pid);
		assert(WIFEXITED(status) && WEXITSTATUS(status) == 0x23);

		return 0;
	}

The last assert() fails because PTRACE_CONT wrongly triggers
another single-step and X86_EFLAGS_TF can't be cleared by
debugger until the tracee does sys_rt_sigreturn().

Change handle_signal() to do user_disable_single_step() if
stepping, we do not need to preserve TIF_SINGLESTEP because we
are going to do ptrace_notify(), and it is simply wrong to leak
this bit.

While at it, change the comment to explain why we also need to
clear TF unconditionally after setup_rt_frame().

Note: in the longer term we should probably change
setup_sigcontext() to use get_flags() and then just remove this
user_disable_single_step().  And, the state of TIF_FORCED_TF can
be wrong after restore_sigcontext() which can set/clear TF, this
needs another fix.

This fix fixes the 'single_step_syscall_32' testcase in
the x86 testsuite:

Before:

	~/linux/tools/testing/selftests/x86> ./single_step_syscall_32
	[RUN]   Set TF and check nop
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check int80
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check a fast syscall
	[WARN]  Hit 10000 SIGTRAPs with si_addr 0xf7789cc0, ip 0xf7789cc0
	Trace/breakpoint trap (core dumped)

After:

	~/linux/linux/tools/testing/selftests/x86> ./single_step_syscall_32
	[RUN]   Set TF and check nop
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check int80
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check a fast syscall
	[OK]    Survived with TF set and 39 traps
	[RUN]   Fast syscall with TF cleared
	[OK]    Nothing unexpected happened

Reported-by: Evan Teran <eteran@alum.rit.edu>
Reported-by: Pedro Alves <palves@redhat.com>
Tested-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
[ Added x86 self-test info. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/signal.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 3e581865c8e2..d185bdd95a4b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -630,7 +630,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 static void
 handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 {
-	bool failed;
+	bool stepping, failed;
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -654,12 +655,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	}
 
 	/*
-	 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
-	 * flag so that register information in the sigcontext is correct.
+	 * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+	 * so that register information in the sigcontext is correct and
+	 * then notify the tracer before entering the signal handler.
 	 */
-	if (unlikely(regs->flags & X86_EFLAGS_TF) &&
-	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
-		regs->flags &= ~X86_EFLAGS_TF;
+	stepping = test_thread_flag(TIF_SINGLESTEP);
+	if (stepping)
+		user_disable_single_step(current);
 
 	failed = (setup_rt_frame(ksig, regs) < 0);
 	if (!failed) {
@@ -670,10 +672,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		 * it might disable possible debug exception from the
 		 * signal handler.
 		 *
-		 * Clear TF when entering the signal handler, but
-		 * notify any tracer that was single-stepping it.
-		 * The tracer may want to single-step inside the
-		 * handler too.
+		 * Clear TF for the case when it wasn't set by debugger to
+		 * avoid the recursive send_sigtrap() in SIGTRAP handler.
 		 */
 		regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
 		/*
@@ -682,7 +682,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		if (used_math())
 			fpu_reset_state(current);
 	}
-	signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
+	signal_setup_done(failed, ksig, stepping);
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From c857eb56e6e8e53dccd8d1e7ea90bcaf3311996d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 15 Apr 2015 20:14:53 +0200
Subject: perf/x86: Fix hw_perf_event::flags collision

Somehow we ended up with overlapping flags when merging the
RDPMC control flag - this is bad, fix it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 329f0356ad4a..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,15 +65,15 @@ struct event_constraint {
 /*
  * struct hw_perf_event.flags flags
  */
-#define PERF_X86_EVENT_PEBS_LDLAT	0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST		0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW	0x4 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
+#define PERF_X86_EVENT_PEBS_LDLAT	0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST		0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW	0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED	0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW	0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW	0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
 
 
 struct amd_nb {
-- 
cgit v1.2.3


From 517e6341fa123ec3a2f9ea78ad547be910529881 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 11 Apr 2015 12:16:22 +0200
Subject: perf/x86/intel: Fix Core2,Atom,NHM,WSM cycles:pp events

Ingo reported that cycles:pp didn't work for him on some machines.

It turns out that in this commit:

  af4bdcf675cf perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events

Andi forgot to explicitly allow that event when he
disabled event flags for PEBS on those uarchs.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: af4bdcf675cf ("perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ca69ea56c712..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -558,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 	EVENT_CONSTRAINT_END
 };
 
@@ -565,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 	EVENT_CONSTRAINT_END
 };
 
@@ -588,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 	EVENT_CONSTRAINT_END
 };
 
@@ -603,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 	EVENT_CONSTRAINT_END
 };
 
-- 
cgit v1.2.3


From 645523960102fa0ac0578d070630e49ab05f06d1 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Thu, 26 Mar 2015 14:28:45 -0700
Subject: perf/x86/intel/rapl: Fix energy counter measurements but supporing
 per domain energy units

RAPL energy hardware unit can vary within a single CPU package, e.g.
HSW server DRAM has a fixed energy unit of 15.3 uJ (2^-16) whereas
the unit on other domains can be enumerated from power unit MSR.

There might be other variations in the future, this patch adds
per cpu model quirk to allow special handling of certain cpus.

hw_unit is also removed from per cpu data since it is not per cpu
and the sampling rate for energy counter is typically not high.

Without this patch, DRAM domain on HSW servers will be counted
4x higher than the real energy counter.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1427405325-780-1-git-send-email-jacob.jun.pan@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 94 ++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
 #define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
 #define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
 
+#define NR_RAPL_DOMAINS         0x4
+static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+	"pp0-core",
+	"package",
+	"dram",
+	"pp1-gpu",
+};
+
 /* Clients have PP0, PKG */
 #define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
 			 1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = {			\
 
 struct rapl_pmu {
 	spinlock_t	 lock;
-	int		 hw_unit;  /* 1/2^hw_unit Joule */
 	int		 n_active; /* number of active events */
 	struct list_head active_list;
 	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
 	struct hrtimer   hrtimer;
 };
 
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;  /* 1/2^hw_unit Joule */
 static struct pmu rapl_pmu_class;
 static cpumask_t rapl_cpu_mask;
 static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
 
+static struct x86_pmu_quirk *rapl_quirks;
 static inline u64 rapl_read_counter(struct perf_event *event)
 {
 	u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
 	return raw;
 }
 
-static inline u64 rapl_scale(u64 v)
+#define rapl_add_quirk(func_)						\
+do {									\
+	static struct x86_pmu_quirk __quirk __initdata = {		\
+		.func = func_,						\
+	};								\
+	__quirk.next = rapl_quirks;					\
+	rapl_quirks = &__quirk;						\
+} while (0)
+
+static inline u64 rapl_scale(u64 v, int cfg)
 {
+	if (cfg > NR_RAPL_DOMAINS) {
+		pr_warn("invalid domain %d, failed to scale data\n", cfg);
+		return v;
+	}
 	/*
 	 * scale delta to smallest unit (1/2^32)
 	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
 	 * or use ldexp(count, -32).
 	 * Watts = Joules/Time delta
 	 */
-	return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
+	return v << (32 - rapl_hw_unit[cfg - 1]);
 }
 
 static u64 rapl_event_update(struct perf_event *event)
@@ -173,7 +195,7 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	sdelta = rapl_scale(delta);
+	sdelta = rapl_scale(delta, event->hw.config);
 
 	local64_add(sdelta, &event->count);
 
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
 	cpumask_set_cpu(cpu, &rapl_cpu_mask);
 }
 
+static __init void rapl_hsw_server_quirk(void)
+{
+	/*
+	 * DRAM domain on HSW server has fixed energy unit which can be
+	 * different than the unit from power unit MSR.
+	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+	 */
+	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+}
+
 static int rapl_cpu_prepare(int cpu)
 {
 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 	int phys_id = topology_physical_package_id(cpu);
 	u64 ms;
-	u64 msr_rapl_power_unit_bits;
 
 	if (pmu)
 		return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
 	if (phys_id < 0)
 		return -1;
 
-	/* protect rdmsrl() to handle virtualization */
-	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
-		return -1;
-
 	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 	if (!pmu)
 		return -1;
-
 	spin_lock_init(&pmu->lock);
 
 	INIT_LIST_HEAD(&pmu->active_list);
 
-	/*
-	 * grab power unit as: 1/2^unit Joules
-	 *
-	 * we cache in local PMU instance
-	 */
-	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 	pmu->pmu = &rapl_pmu_class;
 
 	/*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
 	 * divide interval by 2 to avoid lockstep (2 * 100)
 	 * if hw unit is 32, then we use 2 ms 1/200/2
 	 */
-	if (pmu->hw_unit < 32)
-		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+	if (rapl_hw_unit[0] < 32)
+		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
 	else
 		ms = 2;
 
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+static int rapl_check_hw_unit(void)
+{
+	u64 msr_rapl_power_unit_bits;
+	int i;
+
+	/* protect rdmsrl() to handle virtualization */
+	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+		return -1;
+	for (i = 0; i < NR_RAPL_DOMAINS; i++)
+		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+	return 0;
+}
+
 static const struct x86_cpu_id rapl_cpu_match[] = {
 	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
 	[1] = {},
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
 {
 	struct rapl_pmu *pmu;
 	int cpu, ret;
+	struct x86_pmu_quirk *quirk;
+	int i;
 
 	/*
 	 * check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
 		rapl_cntr_mask = RAPL_IDX_CLN;
 		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
 		break;
+	case 63: /* Haswell-Server */
+		rapl_add_quirk(rapl_hsw_server_quirk);
+		rapl_cntr_mask = RAPL_IDX_SRV;
+		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+		break;
 	case 60: /* Haswell */
 	case 69: /* Haswell-Celeron */
 		rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
 		/* unsupported */
 		return 0;
 	}
+	ret = rapl_check_hw_unit();
+	if (ret)
+		return ret;
 
+	/* run cpu model quirks */
+	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
+		quirk->func();
 	cpu_notifier_register_begin();
 
 	for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
 
 	pmu = __this_cpu_read(rapl_pmu);
 
-	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+	pr_info("RAPL PMU detected,"
 		" API unit is 2^-32 Joules,"
 		" %d fixed counters"
 		" %llu ms ovfl timer\n",
-		pmu->hw_unit,
 		hweight32(rapl_cntr_mask),
 		ktime_to_ms(pmu->timer_interval));
-
+	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+		if (rapl_cntr_mask & (1 << i)) {
+			pr_info("hw unit of domain %s 2^-%d Joules\n",
+				rapl_domain_names[i], rapl_hw_unit[i]);
+		}
+	}
 out:
 	cpu_notifier_register_done();
 
-- 
cgit v1.2.3


From 78d504bcd769cc496f63b626f507039eab2316b7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 2 Apr 2015 04:12:57 -0400
Subject: perf/x86/intel: Add Broadwell support for the LBR callstack

Same as Haswell, Broadwell also support the LBR callstack.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/1427962377-40955-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9da2400c2ec3..219d3fb423a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3275,7 +3275,7 @@ __init int intel_pmu_init(void)
 		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
 									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
 
-		intel_pmu_lbr_init_snb();
+		intel_pmu_lbr_init_hsw();
 
 		x86_pmu.event_constraints = intel_bdw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-- 
cgit v1.2.3


From 18ecb3bfa5a9f6fffbb3eeb4369f0b9463438ec0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 16 Apr 2015 20:41:37 +0200
Subject: x86/fpu: Load xsave pointer *after* initialization

So I was playing with gdb today and did this simple thing:

	gdb /bin/ls

	...

	(gdb) run

Box exploded with this splat:

	BUG: unable to handle kernel NULL pointer dereference at 00000000000001d0
	IP: [<ffffffff8100fe5a>] xstateregs_get+0x7a/0x120
	[...]

	Call Trace:
	 ptrace_regset
	 ptrace_request
	 ? wait_task_inactive
	 ? preempt_count_sub
	 arch_ptrace
	 ? ptrace_get_task_struct
	 SyS_ptrace
	 system_call_fastpath

... because we do cache &target->thread.fpu.state->xsave into the
local variable xsave but that pointer is NULL at that time and
it gets initialized later, in init_fpu(), see:

	e7f180dcd8ab ("x86/fpu: Change xstateregs_get()/set() to use ->xsave.i387 rather than ->fxsave")

The fix is simple: load xsave *after* init_fpu() has run.

Also do the same in xstateregs_set(), as suggested by Oleg Nesterov.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1429209697-5902-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/i387.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 367f39d35e9c..009183276bb7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -341,7 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 		unsigned int pos, unsigned int count,
 		void *kbuf, void __user *ubuf)
 {
-	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+	struct xsave_struct *xsave;
 	int ret;
 
 	if (!cpu_has_xsave)
@@ -351,6 +351,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	xsave = &target->thread.fpu.state->xsave;
+
 	/*
 	 * Copy the 48bytes defined by the software first into the xstate
 	 * memory layout in the thread struct, so that we can copy the entire
@@ -369,7 +371,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		  unsigned int pos, unsigned int count,
 		  const void *kbuf, const void __user *ubuf)
 {
-	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+	struct xsave_struct *xsave;
 	int ret;
 
 	if (!cpu_has_xsave)
@@ -379,6 +381,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	xsave = &target->thread.fpu.state->xsave;
+
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
-- 
cgit v1.2.3


From 8eb68bf75eb7ea73bce88cb9d42efd3bbcece3cd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 16 Apr 2015 12:49:04 -0700
Subject: x86: switch to using asm-generic for seccomp.h

Switch to using the newly created asm-generic/seccomp.h for the seccomp
strict mode syscall definitions. The obsolete sigreturn syscall override
is retained in 32-bit mode, and the ia32 syscall overrides are used in
the compat case. Remaining definitions were identical.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/seccomp.h    | 21 ++++++++++++++++++---
 arch/x86/include/asm/seccomp_32.h | 11 -----------
 arch/x86/include/asm/seccomp_64.h | 17 -----------------
 3 files changed, 18 insertions(+), 31 deletions(-)
 delete mode 100644 arch/x86/include/asm/seccomp_32.h
 delete mode 100644 arch/x86/include/asm/seccomp_64.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
 #ifdef CONFIG_X86_32
-# include <asm/seccomp_32.h>
-#else
-# include <asm/seccomp_64.h>
+#define __NR_seccomp_sigreturn		__NR_sigreturn
 #endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/ia32_unistd.h>
+#define __NR_seccomp_read_32		__NR_ia32_read
+#define __NR_seccomp_write_32		__NR_ia32_write
+#define __NR_seccomp_exit_32		__NR_ia32_exit
+#define __NR_seccomp_sigreturn_32	__NR_ia32_sigreturn
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_32_H
-#define _ASM_X86_SECCOMP_32_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_64_H
-#define _ASM_X86_SECCOMP_64_H
-
-#include <linux/unistd.h>
-#include <asm/ia32_unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_ia32_read
-#define __NR_seccomp_write_32 __NR_ia32_write
-#define __NR_seccomp_exit_32 __NR_ia32_exit
-#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_64_H */
-- 
cgit v1.2.3


From 98b228f55014870092c15d7d168fecac69f2f12a Mon Sep 17 00:00:00 2001
From: Roy Franz <roy.franz@linaro.org>
Date: Wed, 15 Apr 2015 16:32:24 -0700
Subject: x86/efi: Store upper bits of command line buffer address in
 ext_cmd_line_ptr

Until now, the EFI stub was only setting the 32 bit cmd_line_ptr in
the setup_header structure, so on 64 bit platforms this could be truncated.
This patch adds setting the upper bits of the buffer address in
ext_cmd_line_ptr.  This case was likely never hit, as the allocation
for this buffer is done at the lowest available address.  Only
x86_64 kernels have this problem, as the 1-1 mapping mandated
by EFI ensures that all memory is 32 bit addressable on 32 bit
platforms.  The EFI stub does not support mixed mode, so the
32 bit kernel on 64 bit firmware case does not need to be handled.

Signed-off-by: Roy Franz <roy.franz@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/boot/compressed/eboot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 92b9a5f2aed6..5999980206bf 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1110,6 +1110,8 @@ struct boot_params *make_boot_params(struct efi_config *c)
 	if (!cmdline_ptr)
 		goto fail;
 	hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
+	/* Fill in upper bits of command line address, NOP on 32 bit  */
+	boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32;
 
 	hdr->ramdisk_image = 0;
 	hdr->ramdisk_size = 0;
-- 
cgit v1.2.3


From 0c99241c93b8060441f3c8434848e54b5338f922 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 16 Apr 2015 12:38:30 +0200
Subject: perf/x86/intel/pt: Fix and clean up error handling in pt_event_add()

Dan Carpenter reported that pt_event_add() has buggy
error handling logic: it returns 0 instead of -EBUSY when
it fails to start a newly added event.

Furthermore, the control flow in this function is messy,
with cleanup labels mixed with direct returns.

Fix the bug and clean up the code by converting it to
a straight fast path for the regular non-failing case,
plus a clear sequence of cascading goto labels to do
all cleanup.

NOTE: I materially changed the existing clean up logic in the
pt_event_start() failure case to use the direct
perf_aux_output_end() path, not pt_event_del(), because
perf_aux_output_end() is enough here.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150416103830.GB7847@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 33 ++++++++++++++-----------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index f2770641c0fd..ffe666c2c6b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -988,39 +988,36 @@ static int pt_event_add(struct perf_event *event, int mode)
 	int ret = -EBUSY;
 
 	if (pt->handle.event)
-		goto out;
+		goto fail;
 
 	buf = perf_aux_output_begin(&pt->handle, event);
-	if (!buf) {
-		ret = -EINVAL;
-		goto out;
-	}
+	ret = -EINVAL;
+	if (!buf)
+		goto fail_stop;
 
 	pt_buffer_reset_offsets(buf, pt->handle.head);
 	if (!buf->snapshot) {
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
-		if (ret) {
-			perf_aux_output_end(&pt->handle, 0, true);
-			goto out;
-		}
+		if (ret)
+			goto fail_end_stop;
 	}
 
 	if (mode & PERF_EF_START) {
 		pt_event_start(event, 0);
-		if (hwc->state == PERF_HES_STOPPED) {
-			pt_event_del(event, 0);
-			ret = -EBUSY;
-		}
+		ret = -EBUSY;
+		if (hwc->state == PERF_HES_STOPPED)
+			goto fail_end_stop;
 	} else {
 		hwc->state = PERF_HES_STOPPED;
 	}
 
-	ret = 0;
-out:
-
-	if (ret)
-		hwc->state = PERF_HES_STOPPED;
+	return 0;
 
+fail_end_stop:
+	perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+	hwc->state = PERF_HES_STOPPED;
+fail:
 	return ret;
 }
 
-- 
cgit v1.2.3


From a6dfa128ce5c414ab46b1d690f7a1b8decb8526d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 17 Apr 2015 15:04:48 -0400
Subject: config: Enable NEED_DMA_MAP_STATE by default when SWIOTLB is selected

A huge amount of NIC drivers use the DMA API, however if
compiled under 32-bit an very important part of the DMA API can
be ommitted leading to the drivers not working at all
(especially if used with 'swiotlb=force iommu=soft').

As Prashant Sreedharan explains it: "the driver [tg3] uses
DEFINE_DMA_UNMAP_ADDR(), dma_unmap_addr_set() to keep a copy of
the dma "mapping" and dma_unmap_addr() to get the "mapping"
value. On most of the platforms this is a no-op, but ... with
"iommu=soft and swiotlb=force" this house keeping is required,
... otherwise we pass 0 while calling pci_unmap_/pci_dma_sync_
instead of the DMA address."

As such enable this even when using 32-bit kernels.

Reported-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Prashant Sreedharan <prashant@broadcom.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Chan <mchan@broadcom.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: boris.ostrovsky@oracle.com
Cc: cascardo@linux.vnet.ibm.com
Cc: david.vrabel@citrix.com
Cc: sanjeevb@broadcom.com
Cc: siva.kallam@broadcom.com
Cc: vyasevich@gmail.com
Cc: xen-devel@lists.xensource.com
Link: http://lkml.kernel.org/r/20150417190448.GA9462@l.oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..eb79036e3503 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -177,7 +177,7 @@ config SBUS
 
 config NEED_DMA_MAP_STATE
 	def_bool y
-	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
+	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
 
 config NEED_SG_DMA_LENGTH
 	def_bool y
-- 
cgit v1.2.3


From 0b2bb6925eb602eae993a4b5c282a8c18ad1c949 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 26 Mar 2015 00:50:30 -0400
Subject: tools/power turbostat: Initial Skylake support

Skylake adds some additional residency counters.

Skylake supports a different mix of RAPL registers
from any previous product.

In most other ways, Skylake is like Broadwell.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/uapi/asm/msr-index.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c4c75272314a..fe01b0a784e7 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -150,6 +150,11 @@
 #define MSR_PP1_ENERGY_STATUS		0x00000641
 #define MSR_PP1_POLICY			0x00000642
 
+#define MSR_PKG_WEIGHTED_CORE_C0_RES	0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES		0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES		0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES	0x0000065B
+
 #define MSR_CORE_C1_RES			0x00000660
 
 #define MSR_CC6_DEMOTION_POLICY_CONFIG	0x00000668
-- 
cgit v1.2.3


From 94d4b4765b7ddb8478b0d57663cf7a08e2263bbf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 23 Nov 2012 19:19:07 +0100
Subject: x86/mm: Clean up types in xlate_dev_mem_ptr()

Pavel Machek reported the following compiler warning on
x86/32 CONFIG_HIGHMEM64G=y builds:

  arch/x86/mm/ioremap.c:344:10: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]

Clean up the types in this function by using a single natural type for
internal calculations (unsigned long), to make it more apparent what's
happening, and also to remove fragile casts.

Reported-by: Pavel Machek <pavel@ucw.cz>
Cc: jgross@suse.com
Cc: roland@purestorage.com
Link: http://lkml.kernel.org/r/20150416080440.GA507@amd
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/ioremap.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..4bf037b20f47 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -332,18 +332,20 @@ EXPORT_SYMBOL(iounmap);
  */
 void *xlate_dev_mem_ptr(phys_addr_t phys)
 {
-	void *addr;
-	unsigned long start = phys & PAGE_MASK;
+	unsigned long start  = phys &  PAGE_MASK;
+	unsigned long offset = phys & ~PAGE_MASK;
+	unsigned long vaddr;
 
 	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
-	if (addr)
-		addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+	vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+	/* Only add the offset on success and return NULL if the ioremap() failed: */
+	if (vaddr)
+		vaddr += offset;
 
-	return addr;
+	return (void *)vaddr;
 }
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
-- 
cgit v1.2.3


From 085e68eeafbf76e21848ad5bafaecec88a11dd64 Mon Sep 17 00:00:00 2001
From: Ben Serebrin <serebrin@google.com>
Date: Thu, 16 Apr 2015 11:58:05 -0700
Subject: KVM: VMX: Preserve host CR4.MCE value while in guest mode.

The host's decision to enable machine check exceptions should remain
in force during non-root mode.  KVM was writing 0 to cr4 on VCPU reset
and passed a slightly-modified 0 to the vmcs.guest_cr4 value.

Tested: Built.
On earlier version, tested by injecting machine check
while a guest is spinning.

Before the change, if guest CR4.MCE==0, then the machine check is
escalated to Catastrophic Error (CATERR) and the machine dies.
If guest CR4.MCE==1, then the machine check causes VMEXIT and is
handled normally by host Linux. After the change, injecting a machine
check causes normal Linux machine check handling.

Signed-off-by: Ben Serebrin <serebrin@google.com>
Reviewed-by: Venkatesh Srinivas <venkateshs@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce8046c..f7b61687bd79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+	/*
+	 * Pass through host's Machine Check Enable value to hw_cr4, which
+	 * is in force while we are in guest mode.  Do not let guests control
+	 * this bit, even if host CR4.MCE == 0.
+	 */
+	unsigned long hw_cr4 =
+		(cr4_read_shadow() & X86_CR4_MCE) |
+		(cr4 & ~X86_CR4_MCE) |
+		(to_vmx(vcpu)->rmode.vm86_active ?
+		 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	if (cr4 & X86_CR4_VMXE) {
 		/*
-- 
cgit v1.2.3


From 3b6e042188994466ec257b71296b5f85b894dcd9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 21 Apr 2015 17:26:23 +0200
Subject: perf/x86/intel: Add cpu_(prepare|starting|dying) for core_pmu

The core_pmu does not define cpu_* callbacks, which handles
allocation of 'struct cpu_hw_events::shared_regs' data,
initialization of debug store and PMU_FL_EXCL_CNTRS counters.

While this probably won't happen on bare metal, virtual CPU can
define x86_pmu.extra_regs together with PMU version 1 and thus
be using core_pmu -> using shared_regs data without it being
allocated. That could could leave to following panic:

	BUG: unable to handle kernel NULL pointer dereference at (null)
	IP: [<ffffffff8152cd4f>] _spin_lock_irqsave+0x1f/0x40

	SNIP

	 [<ffffffff81024bd9>] __intel_shared_reg_get_constraints+0x69/0x1e0
	 [<ffffffff81024deb>] intel_get_event_constraints+0x9b/0x180
	 [<ffffffff8101e815>] x86_schedule_events+0x75/0x1d0
	 [<ffffffff810586dc>] ? check_preempt_curr+0x7c/0x90
	 [<ffffffff810649fe>] ? try_to_wake_up+0x24e/0x3e0
	 [<ffffffff81064ba2>] ? default_wake_function+0x12/0x20
	 [<ffffffff8109eb16>] ? autoremove_wake_function+0x16/0x40
	 [<ffffffff810577e9>] ? __wake_up_common+0x59/0x90
	 [<ffffffff811a9517>] ? __d_lookup+0xa7/0x150
	 [<ffffffff8119db5f>] ? do_lookup+0x9f/0x230
	 [<ffffffff811a993a>] ? dput+0x9a/0x150
	 [<ffffffff8119c8f5>] ? path_to_nameidata+0x25/0x60
	 [<ffffffff8119e90a>] ? __link_path_walk+0x7da/0x1000
	 [<ffffffff8101d8f9>] ? x86_pmu_add+0xb9/0x170
	 [<ffffffff8101d7a7>] x86_pmu_commit_txn+0x67/0xc0
	 [<ffffffff811b07b0>] ? mntput_no_expire+0x30/0x110
	 [<ffffffff8119c731>] ? path_put+0x31/0x40
	 [<ffffffff8107c297>] ? current_fs_time+0x27/0x30
	 [<ffffffff8117d170>] ? mem_cgroup_get_reclaim_stat_from_page+0x20/0x70
	 [<ffffffff8111b7aa>] group_sched_in+0x13a/0x170
	 [<ffffffff81014a29>] ? sched_clock+0x9/0x10
	 [<ffffffff8111bac8>] ctx_sched_in+0x2e8/0x330
	 [<ffffffff8111bb7b>] perf_event_sched_in+0x6b/0xb0
	 [<ffffffff8111bc36>] perf_event_context_sched_in+0x76/0xc0
	 [<ffffffff8111eb3b>] perf_event_comm+0x1bb/0x2e0
	 [<ffffffff81195ee9>] set_task_comm+0x69/0x80
	 [<ffffffff81195fe1>] setup_new_exec+0xe1/0x2e0
	 [<ffffffff811ea68e>] load_elf_binary+0x3ce/0x1ab0

Adding cpu_(prepare|starting|dying) for core_pmu to have
shared_regs data allocated for core_pmu. AFAICS there's no harm
to initialize debug store and PMU_FL_EXCL_CNTRS either for
core_pmu.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/20150421152623.GC13169@krava.redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 66 +++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 219d3fb423a1..960e85de13fb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2533,34 +2533,6 @@ ssize_t intel_event_sysfs_show(char *page, u64 config)
 	return x86_event_sysfs_show(page, config, event);
 }
 
-static __initconst const struct x86_pmu core_pmu = {
-	.name			= "core",
-	.handle_irq		= x86_pmu_handle_irq,
-	.disable_all		= x86_pmu_disable_all,
-	.enable_all		= core_pmu_enable_all,
-	.enable			= core_pmu_enable_event,
-	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_pmu_hw_config,
-	.schedule_events	= x86_schedule_events,
-	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
-	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
-	.event_map		= intel_pmu_event_map,
-	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
-	.apic			= 1,
-	/*
-	 * Intel PMCs cannot be accessed sanely above 32 bit width,
-	 * so we install an artificial 1<<31 period regardless of
-	 * the generic event period:
-	 */
-	.max_period		= (1ULL << 31) - 1,
-	.get_event_constraints	= intel_get_event_constraints,
-	.put_event_constraints	= intel_put_event_constraints,
-	.event_constraints	= intel_core_event_constraints,
-	.guest_get_msrs		= core_guest_get_msrs,
-	.format_attrs		= intel_arch_formats_attr,
-	.events_sysfs_show	= intel_event_sysfs_show,
-};
-
 struct intel_shared_regs *allocate_shared_regs(int cpu)
 {
 	struct intel_shared_regs *regs;
@@ -2743,6 +2715,44 @@ static struct attribute *intel_arch3_formats_attr[] = {
 	NULL,
 };
 
+static __initconst const struct x86_pmu core_pmu = {
+	.name			= "core",
+	.handle_irq		= x86_pmu_handle_irq,
+	.disable_all		= x86_pmu_disable_all,
+	.enable_all		= core_pmu_enable_all,
+	.enable			= core_pmu_enable_event,
+	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= intel_pmu_event_map,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	.apic			= 1,
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32-bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic event period:
+	 */
+	.max_period		= (1ULL<<31) - 1,
+	.get_event_constraints	= intel_get_event_constraints,
+	.put_event_constraints	= intel_put_event_constraints,
+	.event_constraints	= intel_core_event_constraints,
+	.guest_get_msrs		= core_guest_get_msrs,
+	.format_attrs		= intel_arch_formats_attr,
+	.events_sysfs_show	= intel_event_sysfs_show,
+
+	/*
+	 * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
+	 * together with PMU version 1 and thus be using core_pmu with
+	 * shared_regs. We need following callbacks here to allocate
+	 * it properly.
+	 */
+	.cpu_prepare		= intel_pmu_cpu_prepare,
+	.cpu_starting		= intel_pmu_cpu_starting,
+	.cpu_dying		= intel_pmu_cpu_dying,
+};
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
-- 
cgit v1.2.3


From 80bcffb376a6890dd7452b12c1ba032f8f24fef6 Mon Sep 17 00:00:00 2001
From: Sonny Rao <sonnyrao@chromium.org>
Date: Mon, 20 Apr 2015 15:34:07 -0700
Subject: perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power
 Mobile Processor) IMC uncore PMUs

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonnyrao@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015b755c..ca75e70865ef 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC	0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
 #define SNB_UNC_CTL_UMASK_MASK			0x0000ff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
 		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
 	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
 	{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
 	IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
 	IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
+	IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
 	{  /* end marker */ }
 };
 
-- 
cgit v1.2.3


From 0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2 Mon Sep 17 00:00:00 2001
From: Sonny Rao <sonnyrao@chromium.org>
Date: Tue, 21 Apr 2015 12:33:11 -0700
Subject: perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

This keeps all the related PCI IDs together in the driver where
they are used.

Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1429644791-25724-1-git-send-email-sonnyrao@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70865ef..4562e9e22c60 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC	0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC	0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC	0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC	0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC	0x0a04
 
 /* SNB event control */
-- 
cgit v1.2.3


From 00425bb181c204c8f250fec122e2817a930e0286 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 24 Apr 2015 08:37:09 +0200
Subject: crypto: x86/sha512_ssse3 - fixup for asm function prototype change

Patch e68410ebf626 ("crypto: x86/sha512_ssse3 - move SHA-384/512
SSSE3 implementation to base layer") changed the prototypes of the
core asm SHA-512 implementations so that they are compatible with
the prototype used by the base layer.

However, in one instance, the register that was used for passing the
input buffer was reused as a scratch register later on in the code,
and since the input buffer param changed places with the digest param
-which needs to be written back before the function returns- this
resulted in the scratch register to be dereferenced in a memory write
operation, causing a GPF.

Fix this by changing the scratch register to use the same register as
the input buffer param again.

Fixes: e68410ebf626 ("crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to base layer")
Reported-By: Bobby Powers <bobbypowers@gmail.com>
Tested-By: Bobby Powers <bobbypowers@gmail.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-avx2-asm.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index a4771dcd1fcf..1f20b35d8573 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -79,7 +79,7 @@ NUM_BLKS    = %rdx
 c           = %rcx
 d           = %r8
 e           = %rdx
-y3          = %rdi
+y3          = %rsi
 
 TBL   = %rbp
 
-- 
cgit v1.2.3


From d869844bd081081bf537e806a44811884230643e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 23 Apr 2015 08:33:59 -0700
Subject: x86: fix special __probe_kernel_write() tail zeroing case

Commit cae2a173fe94 ("x86: clean up/fix 'copy_in_user()' tail zeroing")
fixed the failure case tail zeroing of one special case of the x86-64
generic user-copy routine, namely when used for the user-to-user case
("copy_in_user()").

But in the process it broke an even more unusual case: using the user
copy routine for kernel-to-kernel copying.

Now, normally kernel-kernel copies are obviously done using memcpy(),
but we have a couple of special cases when we use the user-copy
functions.  One is when we pass a kernel buffer to a regular user-buffer
routine, using set_fs(KERNEL_DS).  That's a "normal" case, and continued
to work fine, because it never takes any faults (with the possible
exception of a silent and successful vmalloc fault).

But Jan Beulich pointed out another, very unusual, special case: when we
use the user-copy routines not because it's a path that expects a user
pointer, but for a couple of ftrace/kgdb cases that want to do a kernel
copy, but do so using "unsafe" buffers, and use the user-copy routine to
gracefully handle faults.  IOW, for probe_kernel_write().

And that broke for the case of a faulting kernel destination, because we
saw the kernel destination and wanted to try to clear the tail of the
buffer.  Which doesn't work, since that's what faults.

This only triggers for things like kgdb and ftrace users (eg trying
setting a breakpoint on read-only memory), but it's definitely a bug.
The fix is to not compare against the kernel address start (TASK_SIZE),
but instead use the same limits "access_ok()" uses.

Reported-and-tested-by: Jan Beulich <jbeulich@suse.com>
Cc: stable@vger.kernel.org # 4.0
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/usercopy_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 1f33b3d1fd68..0a42327a59d7 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
 	clac();
 
 	/* If the destination is a kernel buffer, we always clear the end */
-	if ((unsigned long)to >= TASK_SIZE_MAX)
+	if (!__addr_ok(to))
 		memset(to, 0, len);
 	return len;
 }
-- 
cgit v1.2.3


From 61f01dd941ba9e06d2bf05994450ecc3d61b6b8b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 26 Apr 2015 16:47:59 -0700
Subject: x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET with
SS == 0 results in an invalid usermode state in which SS is apparently
equal to __USER_DS but causes #SS if used.

Work around the issue by setting SS to __KERNEL_DS __switch_to, thus
ensuring that SYSRET never happens with SS set to NULL.

This was exposed by a recent vDSO cleanup.

Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32entry.S         |  7 +++++++
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/kernel/cpu/amd.c         |  3 +++
 arch/x86/kernel/entry_64.S        |  9 +++++++++
 arch/x86/kernel/process_64.c      | 28 ++++++++++++++++++++++++++++
 5 files changed, 48 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a821b1cd4fa7..72bf2680f819 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -427,6 +427,13 @@ sysretl_from_sys_call:
 	 * cs and ss are loaded from MSRs.
 	 * (Note: 32bit->32bit SYSRET is different: since r11
 	 * does not exist, it merely sets eflags.IF=1).
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized.  This means that we must
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector.  (All
+	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
+	 * from happening by reloading SS in __switch_to.
 	 */
 	USERGS_SYSRET32
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7ee9b94d9921..3d6606fb97d0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -265,6 +265,7 @@
 #define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd470ebf924e..e4cf63301ff4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
 		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
 			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+	/* AMD CPUs don't reset SS attributes on SYSRET */
+	set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c7b238494b31..02c2eff7478d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,15 @@ system_call_fastpath:
 	 * rflags from r11 (but RF and VM bits are forced to 0),
 	 * cs and ss are loaded from MSRs.
 	 * Restoration of rflags re-enables interrupts.
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized.  This means that we should
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector.  (All
+	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
+	 * from happening by reloading SS in __switch_to.  (Actually
+	 * detecting the failure in 64-bit userspace is tricky but can be
+	 * done.)
 	 */
 	USERGS_SYSRET64
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4baaa972f52a..ddfdbf74f174 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);
 
+	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+		/*
+		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+		 * does not update the cached descriptor.  As a result, if we
+		 * do SYSRET while SS is NULL, we'll end up in user mode with
+		 * SS apparently equal to __USER_DS but actually unusable.
+		 *
+		 * The straightforward workaround would be to fix it up just
+		 * before SYSRET, but that would slow down the system call
+		 * fast paths.  Instead, we ensure that SS is never NULL in
+		 * system call context.  We do this by replacing NULL SS
+		 * selectors at every context switch.  SYSCALL sets up a valid
+		 * SS, so the only way to get NULL is to re-enter the kernel
+		 * from CPL 3 through an interrupt.  Since that can't happen
+		 * in the same task as a running syscall, we are guaranteed to
+		 * context switch between every interrupt vector entry and a
+		 * subsequent SYSRET.
+		 *
+		 * We read SS first because SS reads are much faster than
+		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
+		 * it previously had a different non-NULL value.
+		 */
+		unsigned short ss_sel;
+		savesegment(ss, ss_sel);
+		if (ss_sel != __KERNEL_DS)
+			loadsegment(ss, __KERNEL_DS);
+	}
+
 	return prev_p;
 }
 
-- 
cgit v1.2.3


From 5dca0d9147458be9b9363b8a484aa77d710b412a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Wed, 25 Mar 2015 12:08:14 +0100
Subject: kvm: x86: fix kvmclock update protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kvmclock spec says that the host will increment a version field to
an odd number, then update stuff, then increment it to an even number.
The host is buggy and doesn't do this, and the result is observable
when one vcpu reads another vcpu's kvmclock data.

There's no good way for a guest kernel to keep its vdso from reading
a different vcpu's kvmclock data, but we don't need to care about
changing VCPUs as long as we read a consistent data from kvmclock.
(VCPU can change outside of this loop too, so it doesn't matter if we
return a value not fit for this VCPU.)

Based on a patch by Radim Krčmář.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed31c31b2485..c73efcd03e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		&guest_hv_clock, sizeof(guest_hv_clock))))
 		return 0;
 
-	/*
-	 * The interface expects us to write an even number signaling that the
-	 * update is finished. Since the guest won't see the intermediate
-	 * state, we just increase by 2 at the end.
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
 	 */
-	vcpu->hv_clock.version = guest_hv_clock.version + 2;
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
 
 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 				&vcpu->hv_clock,
 				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
 	return 0;
 }
 
-- 
cgit v1.2.3


From 73459e2a1ada09a68c02cc5b73f3116fc8194b3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2015 13:20:18 +0200
Subject: x86: pvclock: Really remove the sched notifier for cross-cpu
 migrations

This reverts commits 0a4e6be9ca17c54817cf814b4b5aa60478c6df27
and 80f7fdb1c7f0f9266421f823964fd1962681f6ce.

The task migration notifier was originally introduced in order to support
the pvclock vsyscall with non-synchronized TSC, but KVM only supports it
with synchronized TSC.  Hence, on KVM the race condition is only needed
due to a bad implementation on the host side, and even then it's so rare
that it's mostly theoretical.

As far as KVM is concerned it's possible to fix the host, avoiding the
additional complexity in the vDSO and the (re)introduction of the task
migration notifier.

Xen, on the other hand, hasn't yet implemented vsyscall support at
all, so we do not care about its plans for non-synchronized TSC.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/pvclock.h |  1 -
 arch/x86/kernel/pvclock.c      | 44 ------------------------------------------
 arch/x86/vdso/vclock_gettime.c | 34 ++++++++++++++------------------
 3 files changed, 15 insertions(+), 64 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 25b1cc07d496..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index e5ecd20e72dd..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-	if (!pvclock_vdso_info) {
-		BUG();
-		return NULL;
-	}
-
-	return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-			        void *v)
-{
-	struct task_migration_notifier *mn = v;
-	struct pvclock_vsyscall_time_info *pvti;
-
-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-	/* this is NULL when pvclock vsyscall is not initialized */
-	if (unlikely(pvti == NULL))
-		return NOTIFY_DONE;
-
-	pvti->migrate_count++;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-	.notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-	pvclock_vdso_info = i;
-
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}
 
-
-	register_task_migration_notifier(&pvclock_migrate);
-
 	return 0;
 }
 #endif
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 40d2473836c9..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
 		 * __getcpu() calls (Gleb).
 		 */
 
-		/* Make sure migrate_count will change if we leave the VCPU. */
-		do {
-			pvti = get_pvti(cpu);
-			migrate_count = pvti->migrate_count;
-
-			cpu1 = cpu;
-			cpu = __getcpu() & VGETCPU_CPU_MASK;
-		} while (unlikely(cpu != cpu1));
+		pvti = get_pvti(cpu);
 
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
 		 * Test we're still on the cpu as well as the version.
-		 * - We must read TSC of pvti's VCPU.
-		 * - KVM doesn't follow the versioning protocol, so data could
-		 *   change before version if we left the VCPU.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
 		 */
-		smp_rmb();
-	} while (unlikely((pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
-- 
cgit v1.2.3


From 2b953a5e994ce279904ec70220f7d4f31d380a0a Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Tue, 28 Apr 2015 18:46:20 -0400
Subject: xen: Suspend ticks on all CPUs during suspend

Commit 77e32c89a711 ("clockevents: Manage device's state separately for
the core") decouples clockevent device's modes from states. With this
change when a Xen guest tries to resume, it won't be calling its
set_mode op which needs to be done on each VCPU in order to make the
hypervisor aware that we are in oneshot mode.

This happens because clockevents_tick_resume() (which is an intermediate
step of resuming ticks on a processor) doesn't call clockevents_set_state()
anymore and because during suspend clockevent devices on all VCPUs (except
for the one doing the suspend) are left in ONESHOT state. As result, during
resume the clockevents state machine will assume that device is already
where it should be and doesn't need to be updated.

To avoid this problem we should suspend ticks on all VCPUs during
suspend.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/suspend.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index d9497698645a..53b4c0811f4f 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -88,7 +88,17 @@ static void xen_vcpu_notify_restore(void *data)
 	tick_resume_local();
 }
 
+static void xen_vcpu_notify_suspend(void *data)
+{
+	tick_suspend_local();
+}
+
 void xen_arch_resume(void)
 {
 	on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
 }
+
+void xen_arch_suspend(void)
+{
+	on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
+}
-- 
cgit v1.2.3


From 2c62e8492ed7358bbe7da51666c7e0f6da9474ee Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 30 Apr 2015 12:41:28 +0800
Subject: x86/PCI/ACPI: Make all resources except [io 0xcf8-0xcff] available on
 PCI bus

An IO port or MMIO resource assigned to a PCI host bridge may be
consumed by the host bridge itself or available to its child
bus/devices. The ACPI specification defines a bit (Producer/Consumer)
to tell whether the resource is consumed by the host bridge itself,
but firmware hasn't used that bit consistently, so we can't rely on it.

Before commit 593669c2ac0f ("x86/PCI/ACPI: Use common ACPI resource
interfaces to simplify implementation"), arch/x86/pci/acpi.c ignored
all IO port resources defined by acpi_resource_io and
acpi_resource_fixed_io to filter out IO ports consumed by the host
bridge itself.

Commit 593669c2ac0f ("x86/PCI/ACPI: Use common ACPI resource interfaces
to simplify implementation") started accepting all IO port and MMIO
resources, which caused a regression that IO port resources consumed
by the host bridge itself became available to its child devices.

Then commit 63f1789ec716 ("x86/PCI/ACPI: Ignore resources consumed by
host bridge itself") ignored resources consumed by the host bridge
itself by checking the IORESOURCE_WINDOW flag, which accidently removed
MMIO resources defined by acpi_resource_memory24, acpi_resource_memory32
and acpi_resource_fixed_memory32.

On x86 and IA64 platforms, all IO port and MMIO resources are assumed
to be available to child bus/devices except one special case:
    IO port [0xCF8-0xCFF] is consumed by the host bridge itself
    to access PCI configuration space.

So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF]. This solution
will also ease the way to consolidate ACPI PCI host bridge common code
from x86, ia64 and ARM64.

Related ACPI table are archived at:
https://bugzilla.kernel.org/show_bug.cgi?id=94221

Related discussions at:
http://patchwork.ozlabs.org/patch/461633/
https://lkml.org/lkml/2015/3/29/304

Fixes: 63f1789ec716 (Ignore resources consumed by host bridge itself)
Reported-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: 4.0+ <stable@vger.kernel.org> # 4.0+
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/pci/acpi.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e4695985f9de..d93963340c3c 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -325,6 +325,26 @@ static void release_pci_root_info(struct pci_host_bridge *bridge)
 	kfree(info);
 }
 
+/*
+ * An IO port or MMIO resource assigned to a PCI host bridge may be
+ * consumed by the host bridge itself or available to its child
+ * bus/devices. The ACPI specification defines a bit (Producer/Consumer)
+ * to tell whether the resource is consumed by the host bridge itself,
+ * but firmware hasn't used that bit consistently, so we can't rely on it.
+ *
+ * On x86 and IA64 platforms, all IO port and MMIO resources are assumed
+ * to be available to child bus/devices except one special case:
+ *     IO port [0xCF8-0xCFF] is consumed by the host bridge itself
+ *     to access PCI configuration space.
+ *
+ * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF].
+ */
+static bool resource_is_pcicfg_ioport(struct resource *res)
+{
+	return (res->flags & IORESOURCE_IO) &&
+		res->start == 0xCF8 && res->end == 0xCFF;
+}
+
 static void probe_pci_root_info(struct pci_root_info *info,
 				struct acpi_device *device,
 				int busnum, int domain,
@@ -346,8 +366,8 @@ static void probe_pci_root_info(struct pci_root_info *info,
 			"no IO and memory resources present in _CRS\n");
 	else
 		resource_list_for_each_entry_safe(entry, tmp, list) {
-			if ((entry->res->flags & IORESOURCE_WINDOW) == 0 ||
-			    (entry->res->flags & IORESOURCE_DISABLED))
+			if ((entry->res->flags & IORESOURCE_DISABLED) ||
+			    resource_is_pcicfg_ioport(entry->res))
 				resource_list_destroy_entry(entry);
 			else
 				entry->res->name = info->name;
-- 
cgit v1.2.3


From e8a4a2696fecb398b0288c43c0e0dbb91e265bb2 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Mon, 4 May 2015 21:15:31 -0700
Subject: x86/spinlocks: Fix regression in spinlock contention detection

A spinlock is regarded as contended when there is at least one waiter.
Currently, the code that checks whether there are any waiters rely on
tail value being greater than head. However, this is not true if tail
reaches the max value and wraps back to zero, so arch_spin_is_contended()
incorrectly returns 0 (not contended) when tail is smaller than head.

The original code (before regression) handled this case by casting the
(tail - head) to an unsigned value. This change simply restores that
behavior.

Fixes: d6abfdb20223 ("x86/spinlocks/paravirt: Fix memory corruption on unlock")
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Cc: peterz@infradead.org
Cc: Waiman.Long@hp.com
Cc: borntraeger@de.ibm.com
Cc: oleg@redhat.com
Cc: raghavendra.kt@linux.vnet.ibm.com
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1430799331-20445-1-git-send-email-tahsin@google.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/spinlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index cf87de3fc390..64b611782ef0 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -169,7 +169,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
 	tmp.head &= ~TICKET_SLOWPATH_FLAG;
-	return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
+	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended	arch_spin_is_contended
 
-- 
cgit v1.2.3


From a71dbdaa8ca2933391b08e0ae5567083e3af0892 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Mon, 4 May 2015 11:02:15 -0400
Subject: hypervisor/x86/xen: Unset X86_BUG_SYSRET_SS_ATTRS on Xen PV guests

Commit 61f01dd941ba ("x86_64, asm: Work around AMD SYSRET SS descriptor
attribute issue") makes AMD processors set SS to __KERNEL_DS in
__switch_to() to deal with cases when SS is NULL.

This breaks Xen PV guests who do not want to load SS with__KERNEL_DS.

Since the problem that the commit is trying to address would have to be
fixed in the hypervisor (if it in fact exists under Xen) there is no
reason to set X86_BUG_SYSRET_SS_ATTRS flag for PV VPCUs here.

This can be easily achieved by adding x86_hyper_xen_hvm.set_cpu_features
op which will clear this flag. (And since this structure is no longer
HVM-specific we should do some renaming).

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/include/asm/hypervisor.h |  2 +-
 arch/x86/kernel/cpu/hypervisor.c  |  4 ++--
 arch/x86/xen/enlighten.c          | 27 ++++++++++++++++++---------
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index e42f758a0fbd..055ea9941dd5 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -50,7 +50,7 @@ extern const struct hypervisor_x86 *x86_hyper;
 /* Recognized hypervisors */
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
-extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_xen;
 extern const struct hypervisor_x86 x86_hyper_kvm;
 
 extern void init_hypervisor(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 36ce402a3fa5..d820d8eae96b 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -27,8 +27,8 @@
 
 static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-#ifdef CONFIG_XEN_PVHVM
-	&x86_hyper_xen_hvm,
+#ifdef CONFIG_XEN
+	&x86_hyper_xen,
 #endif
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 94578efd3067..46957ead3060 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1760,6 +1760,9 @@ static struct notifier_block xen_hvm_cpu_notifier = {
 
 static void __init xen_hvm_guest_init(void)
 {
+	if (xen_pv_domain())
+		return;
+
 	init_hvm_pv_info();
 
 	xen_hvm_init_shared_info();
@@ -1775,6 +1778,7 @@ static void __init xen_hvm_guest_init(void)
 	xen_hvm_init_time_ops();
 	xen_hvm_init_mmu_ops();
 }
+#endif
 
 static bool xen_nopv = false;
 static __init int xen_parse_nopv(char *arg)
@@ -1784,14 +1788,11 @@ static __init int xen_parse_nopv(char *arg)
 }
 early_param("xen_nopv", xen_parse_nopv);
 
-static uint32_t __init xen_hvm_platform(void)
+static uint32_t __init xen_platform(void)
 {
 	if (xen_nopv)
 		return 0;
 
-	if (xen_pv_domain())
-		return 0;
-
 	return xen_cpuid_base();
 }
 
@@ -1809,11 +1810,19 @@ bool xen_hvm_need_lapic(void)
 }
 EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
 
-const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
-	.name			= "Xen HVM",
-	.detect			= xen_hvm_platform,
+static void xen_set_cpu_features(struct cpuinfo_x86 *c)
+{
+	if (xen_pv_domain())
+		clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+}
+
+const struct hypervisor_x86 x86_hyper_xen = {
+	.name			= "Xen",
+	.detect			= xen_platform,
+#ifdef CONFIG_XEN_PVHVM
 	.init_platform		= xen_hvm_guest_init,
+#endif
 	.x2apic_available	= xen_x2apic_para_available,
+	.set_cpu_features       = xen_set_cpu_features,
 };
-EXPORT_SYMBOL(x86_hyper_xen_hvm);
-#endif
+EXPORT_SYMBOL(x86_hyper_xen);
-- 
cgit v1.2.3


From de71ad2c97862eae1516aa36528cc3b317c17b2f Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.c.dionne@gmail.com>
Date: Mon, 4 May 2015 15:16:44 -0300
Subject: x86: Make cpu_tss available to external modules

Commit 75182b1632 ("x86/asm/entry: Switch all C consumers of
kernel_stack to this_cpu_sp0()") changed current_thread_info
to use this_cpu_sp0, and indirectly made it rely on init_tss
which was exported with EXPORT_PER_CPU_SYMBOL_GPL.
As a result some macros and inline functions such as set/get_fs,
test_thread_flag and variants have been made unusable for
external modules.

Make cpu_tss exported with EXPORT_PER_CPU_SYMBOL so that these
functions are accessible again, as they were previously.

Signed-off-by: Marc Dionne <marc.dionne@your-file-system.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Link: http://lkml.kernel.org/r/1430763404-21221-1-git-send-email-marc.dionne@your-file-system.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8213da62b1b7..bfc99b3b6522 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,7 +57,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
 };
-EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
-- 
cgit v1.2.3


From c88d47480d300eaad80c213d50c9bf6077fc49bc Mon Sep 17 00:00:00 2001
From: Bobby Powers <bobbypowers@gmail.com>
Date: Mon, 27 Apr 2015 08:10:41 -0700
Subject: x86/fpu: Always restore_xinit_state() when use_eager_cpu()

The following commit:

  f893959b0898 ("x86/fpu: Don't abuse drop_init_fpu() in flush_thread()")

removed drop_init_fpu() usage from flush_thread(). This seems to break
things for me - the Go 1.4 test suite fails all over the place with
floating point comparision errors (offending commit found through
bisection).

The functional change was that flush_thread() after this commit
only calls restore_init_xstate() when both use_eager_fpu() and
!used_math() are true. drop_init_fpu() (now fpu_reset_state()) calls
restore_init_xstate() regardless of whether current used_math() - apply
the same logic here.

Switch used_math() -> tsk_used_math(tsk) to consistently use the grabbed
tsk instead of current, like in the rest of flush_thread().

Tested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Bobby Powers <bobbypowers@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f893959b ("x86/fpu: Don't abuse drop_init_fpu() in flush_thread()")
Link: http://lkml.kernel.org/r/1430147441-9820-1-git-send-email-bobbypowers@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bfc99b3b6522..6e338e3b1dc0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -156,11 +156,13 @@ void flush_thread(void)
 		/* FPU state will be reallocated lazily at the first use. */
 		drop_fpu(tsk);
 		free_thread_xstate(tsk);
-	} else if (!used_math()) {
-		/* kthread execs. TODO: cleanup this horror. */
-		if (WARN_ON(init_fpu(tsk)))
-			force_sig(SIGKILL, tsk);
-		user_fpu_begin();
+	} else {
+		if (!tsk_used_math(tsk)) {
+			/* kthread execs. TODO: cleanup this horror. */
+			if (WARN_ON(init_fpu(tsk)))
+				force_sig(SIGKILL, tsk);
+			user_fpu_begin();
+		}
 		restore_init_xstate();
 	}
 }
-- 
cgit v1.2.3


From 8746515d7f04c9ea94cf43e2db1fd2cfca93276d Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Fri, 24 Apr 2015 10:16:40 +0100
Subject: xen: Add __GFP_DMA flag when xen_swiotlb_init gets free pages on ARM

Make sure that xen_swiotlb_init allocates buffers that are DMA capable
when at least one memblock is available below 4G. Otherwise we assume
that all devices on the SoC can cope with >4G addresses. We do this on
ARM and ARM64, where dom0 is mapped 1:1, so pfn == mfn in this case.

No functional changes on x86.

From: Chen Baozi <baozich@gmail.com>

Signed-off-by: Chen Baozi <baozich@gmail.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Tested-by: Chen Baozi <baozich@gmail.com>
Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/include/asm/xen/page.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 358dcd338915..c44a5d53e464 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -269,4 +269,9 @@ static inline bool xen_arch_need_swiotlb(struct device *dev,
 	return false;
 }
 
+static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
+{
+	return __get_free_pages(__GFP_NOWARN, order);
+}
+
 #endif /* _ASM_X86_XEN_PAGE_H */
-- 
cgit v1.2.3