From b9fde58db7e5738cacb740b0ec547933fe314fbe Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Sep 2017 16:35:44 +1000 Subject: powerpc/powernv: Rework EEH initialization on powernv Remove the post_init callback, which is only used by powernv; we can just call it explicitly from the powernv code. This partially kills the ability to "disable" eeh at runtime via debugfs, as this was calling that same callback again, but this is both unused and broken in several ways. If we want to revive it, we need to create a dedicated enable/disable callback on the backend that does the right thing. Let the bulk of eeh initialize normally at core_initcall() like it does on pseries, by removing the hack in eeh_init() that delays it. Instead we make sure our eeh->probe cleanly bails out if the PEs haven't been created yet, and we force a re-probe where we used to call eeh_init() again. Signed-off-by: Benjamin Herrenschmidt Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 42 +++++++++++++++------------- arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- arch/powerpc/platforms/powernv/pci.h | 1 + 3 files changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8864065eba22..4650fb294e7a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,6 @@ #include "powernv.h" #include "pci.h" -static bool pnv_eeh_nb_init = false; static int eeh_event_irq = -EINVAL; static int pnv_eeh_init(void) @@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); * been built. If the I/O cache staff has been built, EEH is * ready to supply service.
*/ -static int pnv_eeh_post_init(void) +int pnv_eeh_post_init(void) { struct pci_controller *hose; struct pnv_phb *phb; int ret = 0; - /* Register OPAL event notifier */ - if (!pnv_eeh_nb_init) { - eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); - if (eeh_event_irq < 0) { - pr_err("%s: Can't register OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return eeh_event_irq; - } + /* Probe devices & build address cache */ + eeh_probe_devices(); + eeh_addr_cache_build(); - ret = request_irq(eeh_event_irq, pnv_eeh_event, - IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); - if (ret < 0) { - irq_dispose_mapping(eeh_event_irq); - pr_err("%s: Can't request OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return ret; - } + /* Register OPAL event notifier */ + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } - pnv_eeh_nb_init = true; + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return ret; } if (!eeh_enabled()) @@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* Skip if we haven't probed yet */ + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + return NULL; + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", .init = pnv_eeh_init, - .post_init = pnv_eeh_post_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_pe_addr = pnv_eeh_get_pe_addr, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57f9e55f4352..fb5cd7511189 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3293,8 +3293,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a95273c524f6..56d1f272d4ad 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -234,6 +234,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); -- cgit v1.2.3 From 5080332c2c893118dbc18755f35c8b0131cf0fc4 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:25:48 +1000 Subject: powerpc/64s: Add workaround for P9 vector CI load issue POWER9 DD2.1 and earlier have an issue where some cache inhibited vector loads will return bad data. The workaround is in two parts: one firmware/microcode part triggers HMI interrupts when hitting such loads, and the other part is this patch, which then emulates the instructions in Linux. The affected instructions are limited to lxvd2x, lxvw4x, lxvb16x and lxvh8x.
When an instruction triggers the HMI, all threads in the core will be sent to the HMI handler, not just the one running the vector load. In general, these spurious HMIs are detected by the emulation code and we just return to the running process. Unfortunately, if a spurious interrupt occurs on a vector load to normal memory, we have no way to detect that it's spurious (unless we walk the page tables, which is very expensive). In this case we emulate the load, but we need to do so using a vector load itself to ensure 128-bit atomicity is preserved. Some additional debugfs counters for emulated instructions are also added. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt [mpe: Switch CONFIG_PPC_BOOK3S_64 to CONFIG_VSX to unbreak the build] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/smp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c17f81e433f7..355d3f99cafb 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) -- cgit v1.2.3 From e36d0a2ed5019184bb9b94ff1138c87c05905789 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:42 +1000 Subject: powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET This allows MSR[EE]=0 lockups to be detected on an OPAL (bare metal) system similarly to the hcall NMI IPI on pseries guests, when the platform/firmware supports it.
This is an example of CPU10 spinning with interrupts hard disabled: Watchdog CPU:32 detected Hard LOCKUP other CPUS:10 Watchdog CPU:10 Hard LOCKUP CPU: 10 PID: 4410 Comm: bash Not tainted 4.13.0-rc7-00074-ge89ce1f89f62-dirty #34 task: c0000003a82b4400 task.stack: c0000003af55c000 NIP: c0000000000a7b38 LR: c000000000659044 CTR: c0000000000a7b00 REGS: c00000000fd23d80 TRAP: 0100 Not tainted (4.13.0-rc7-00074-ge89ce1f89f62-dirty) MSR: 90000000000c1033 CR: 28422222 XER: 20000000 CFAR: c0000000000a7b38 SOFTE: 0 GPR00: c000000000659044 c0000003af55fbb0 c000000001072a00 0000000000000078 GPR04: c0000003c81b5c80 c0000003c81cc7e8 9000000000009033 0000000000000000 GPR08: 0000000000000000 c0000000000a7b00 0000000000000001 9000000000001003 GPR12: c0000000000a7b00 c00000000fd83200 0000000010180df8 0000000010189e60 GPR16: 0000000010189ed8 0000000010151270 000000001018bd88 000000001018de78 GPR20: 00000000370a0668 0000000000000001 00000000101645e0 0000000010163c10 GPR24: 00007fffd14d6294 00007fffd14d6290 c000000000fba6f0 0000000000000004 GPR28: c000000000f351d8 0000000000000078 c000000000f4095c 0000000000000000 NIP [c0000000000a7b38] sysrq_handle_xmon+0x38/0x40 LR [c000000000659044] __handle_sysrq+0xe4/0x270 Call Trace: [c0000003af55fbd0] [c000000000659044] __handle_sysrq+0xe4/0x270 [c0000003af55fc70] [c000000000659810] write_sysrq_trigger+0x70/0xa0 [c0000003af55fca0] [c0000000003da650] proc_reg_write+0xb0/0x110 [c0000003af55fcf0] [c0000000003423bc] __vfs_write+0x6c/0x1b0 [c0000003af55fd90] [c000000000344398] vfs_write+0xd8/0x240 [c0000003af55fde0] [c00000000034632c] SyS_write+0x6c/0x110 [c0000003af55fe30] [c00000000000b220] system_call+0x58/0x6c Signed-off-by: Nicholas Piggin [mpe: Use kernel types for opal_signal_system_reset()] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/setup.c | 1 + arch/powerpc/platforms/powernv/smp.c | 52 ++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 8c1ede2d3f7e..37cd170201a2 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..cf52d53da460 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -282,6 +282,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; pm_power_off = pnv_power_off; ppc_md.halt = pnv_halt; + /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; ppc_md.hmi_exception_early = opal_hmi_exception_early; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 355d3f99cafb..ba030669eca1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ 
b/arch/powerpc/platforms/powernv/smp.c @@ -297,6 +297,54 @@ static void __init pnv_smp_probe(void) } } +static int pnv_system_reset_exception(struct pt_regs *regs) +{ + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; +} + +static int pnv_cause_nmi_ipi(int cpu) +{ + int64_t rc; + + if (cpu >= 0) { + rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu)); + if (rc != OPAL_SUCCESS) + return 0; + return 1; + + } else if (cpu == NMI_IPI_ALL_OTHERS) { + bool success = true; + int c; + + + /* + * We do not use broadcasts (yet), because it's not clear + * exactly what semantics Linux wants or the firmware should + * provide. + */ + for_each_online_cpu(c) { + if (c == smp_processor_id()) + continue; + + rc = opal_signal_system_reset( + get_hard_smp_processor_id(c)); + if (rc != OPAL_SUCCESS) + success = false; + } + if (success) + return 1; + + /* + * Caller will fall back to doorbells, which may pick + * up the remainders. + */ + } + + return 0; +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ @@ -315,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = { /* This is called very early during platform setup_arch */ void __init pnv_smp_init(void) { + if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) { + ppc_md.system_reset_exception = pnv_system_reset_exception; + pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi; + } smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 01451ad47e2724eb123e87a5bae04e943046b87a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 22 Sep 2017 17:05:00 +0530 Subject: powerpc/powermac: Use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powermac/low_i2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 70183eb3d5c8..39a1d4225e0f 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) mutex_init(&host->mutex); init_completion(&host->complete); spin_lock_init(&host->lock); - init_timer(&host->timeout_timer); - host->timeout_timer.function = kw_i2c_timeout; - host->timeout_timer.data = (unsigned long)host; + setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host); psteps = of_get_property(np, "AAPL,address-step", NULL); steps = psteps ? (*psteps) : 0x10; -- cgit v1.2.3 From c6baa077b784c3b37391a8c11f433e3f881a80df Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Sep 2017 13:58:02 +1000 Subject: powerpc/powernv: Make opal_event_shutdown() callable from IRQ context In opal_event_shutdown() we free all the IRQs hanging off the opal_event_irqchip. However it's not safe to do so if we're called from IRQ context, because free_irq() wants to synchronise versus IRQ context. This can lead to warnings and a stuck system. For example from sysrq-b: Trying to free IRQ 17 from IRQ context! ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/irq/manage.c:1461 __free_irq+0x398/0x8d0 ... 
NIP __free_irq+0x398/0x8d0 LR __free_irq+0x394/0x8d0 Call Trace: __free_irq+0x394/0x8d0 (unreliable) free_irq+0xa4/0x140 opal_event_shutdown+0x128/0x180 opal_shutdown+0x1c/0xb0 pnv_shutdown+0x20/0x40 machine_restart+0x38/0x90 emergency_restart+0x28/0x40 sysrq_handle_reboot+0x24/0x40 __handle_sysrq+0x198/0x590 hvc_poll+0x48c/0x8c0 hvc_handle_interrupt+0x1c/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_level_irq+0x250/0x770 generic_handle_irq+0x5c/0xa0 opal_handle_events+0x11c/0x240 opal_interrupt+0x38/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_fasteoi_irq+0x174/0xa10 generic_handle_irq+0x5c/0xa0 __do_irq+0xbc/0x4e0 call_do_irq+0x14/0x24 do_IRQ+0x18c/0x540 hardware_interrupt_common+0x158/0x180 We can avoid that by using disable_irq_nosync() rather than free_irq(). Although it doesn't fully free the IRQ, it should be sufficient when we're shutting down, particularly in an emergency. Add an in_interrupt() check and use free_irq() when we're shutting down normally. It's probably OK to use disable_irq_nosync() in that case too, but for now it's safer to leave that behaviour as-is. Fixes: 9f0fd0499d30 ("powerpc/powernv: Add a virtual irqchip for opal events") Reported-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-irqchip.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index ecdcba9d1220..9d1b8c0aaf93 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -174,8 +174,14 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) + if (!opal_irqs[i]) + continue; + + if (in_interrupt()) + disable_irq_nosync(opal_irqs[i]); + else free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; } } -- cgit v1.2.3 From cee5405da4020b0b0233bc8fb7c8da7322d2c52e Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:47 -0500 Subject: powerpc/hotplug: Improve responsiveness of hotplug change powerpc/hotplug: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. During hotplug CPU operations, this patch resets the timer on topology update work function to a small value to better ensure that the CPU topology is detected and configured sooner. 
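The resulting pattern around the online/offline paths looks like the sketch below (timed_topology_update() itself comes from a companion NUMA patch; its argument is assumed here to be the new poll delay in seconds):

	cpu_maps_update_done();
	/* ask for a topology re-check shortly, instead of waiting for the
	 * normal polling interval (the delay unit is an assumption) */
	timed_topology_update(1);
	rc = device_online(get_cpu_device(cpu));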
Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fadb95efbb9e..a7d14aa7bb7c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn) BUG_ON(get_cpu_current_state(cpu) != CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_online(get_cpu_device(cpu)); if (rc) goto out; @@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn) set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_offline(get_cpu_device(cpu)); if (rc) goto out; -- cgit v1.2.3 From 54820530c5faa9fd78e1c08cb6449100b1a19157 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 21:17:18 +1100 Subject: powerpc/powernv: Enable TM without suspend if possible Some Power9 revisions can run in a mode where TM operates without suspended state. If we find ourselves on a CPU that might be in this mode, we query OPAL to check, and if so we re-enable TM in CPU features, and enable a new user feature to signal to userspace that we are in this mode. We do not enable the "normal" user feature, PPC_FEATURE2_HTM, but we do enable PPC_FEATURE2_HTM_NOSC because that indicates to userspace that the kernel will abort transactions on syscall entry, which is true regardless of the suspend mode. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index cf52d53da460..d23f148a11f0 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "powernv.h" @@ -304,6 +305,28 @@ static int __init pnv_probe(void) return 1; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void __init pnv_tm_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL) || + !pvr_version_is(PVR_POWER9) || + early_cpu_has_feature(CPU_FTR_TM)) + return; + + if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS) + return; + + pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_TM; + /* Make sure "normal" HTM is off (it should be) */ + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM; + /* Turn on no suspend mode, and HTM no SC */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \ + PPC_FEATURE2_HTM_NOSC; + tm_suspend_disabled = true; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + /* * Returns the cpu frequency for 'cpu' in Hz. This is used by * /proc/cpuinfo -- cgit v1.2.3 From c28237f1d4ed2c9022f9eed656ecf36999e34f47 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 17 Oct 2017 13:31:42 +0200 Subject: powerpc-opal: Fix a typo in a comment line of two file headers Fix a word in these descriptions.
Signed-off-by: Markus Elfring Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-hmi.c | 2 +- arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d78fed728cdf..c9e1a4ff295c 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,5 +1,5 @@ /* - * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * OPAL hypervisor Maintenance interrupt handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 4495f428b500..d9916ea62305 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,5 +1,5 @@ /* - * OPAL asynchronus Memory error handling support in PowreNV. + * OPAL asynchronus Memory error handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit v1.2.3 From 4dd9eab39c71628d113168a01473ee17b5f61eac Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 18 Oct 2017 20:48:52 +0200 Subject: powerpc/pseries: Cleanup error handling in iommu_pseries_alloc_group() Although kfree(NULL) is legal, it's a bit lazy to rely on that to implement the error handling. So do it the normal Linux way using labels for each failure path. Signed-off-by: Markus Elfring [mpe: Squash a few patches and rewrite change log] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c181467d0ad..69921f72e2da 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -55,23 +55,23 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { - struct iommu_table_group *table_group = NULL; - struct iommu_table *tbl = NULL; - struct iommu_table_group_link *tgl = NULL; + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); if (!table_group) - goto fail_exit; + return NULL; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto fail_exit; + goto free_group; tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, node); if (!tgl) - goto fail_exit; + goto free_table; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); @@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) return table_group; -fail_exit: - kfree(tgl); - kfree(table_group); +free_table: kfree(tbl); - +free_group: + kfree(table_group); return NULL; } -- cgit v1.2.3 From 63c9d8a4b394f9d8e995292a7c74648760235b44 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 17:08:15 +1000 Subject: powerpc/powernv: Use FIXUP_ENDIAN_HV in OPAL return Close the recoverability gap for OPAL calls by using FIXUP_ENDIAN_HV in the return path. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 37cd170201a2..6f4b00a2ac46 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -94,7 +94,7 @@ opal_return: * bytes (always BE) since MSR:LE will end up fixed up as a side * effect of the rfid. */ - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); @@ -120,7 +120,7 @@ opal_real_call: hrfid opal_return_realmode: - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) -- cgit v1.2.3 From b8f89fea599d91e674497aad572613eb63181f31 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Thu, 28 Sep 2017 20:19:20 -0400 Subject: powerpc/pseries/vio: Dispose of virq mapping on vdevice unregister When a vdevice is DLPAR removed from the system the vio subsystem doesn't bother unmapping the virq from the irq_domain. As a result we have a virq mapped to a hardware irq that is no longer valid for the irq_domain. A side effect is that we are left with /proc/irq/<virq> affinity entries, and attempts to modify the smp_affinity of the irq will fail. In the following observed example the kernel log is spammed by ics_rtas_set_affinity errors after the removal of a VSCSI adapter. This is a result of irqbalance trying to adjust the affinity every 10 seconds. rpadlpar_io: slot U8408.E8E.10A7ACV-V5-C25 removed ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 This patch fixes the issue by calling irq_dispose_mapping() on the virq of the viodev on unregister. Fixes: f2ab6219969f ("powerpc/pseries: Add PFO support to the VIO bus") Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/vio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 12277bc9fd9e..d86938260a86 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev); void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); } EXPORT_SYMBOL(vio_unregister_device); -- cgit v1.2.3 From d6f934fd48803d9e58040e2cbab2feafe9bb9f01 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 27 Sep 2017 16:52:31 +1000 Subject: powerpc/powernv: Reserve a hole which appears after enabling IOV In order to make generic IOV code work, the physical function IOV BAR should start from the offset of the first VF. Since M64 segments share PE number space across the PHB, and some PEs may be in use at the time when IOV is enabled, the existing code shifts the IOV BAR to the index of the first PE/VF. This creates a hole in IOMEM space which can be potentially taken by some other device. This reserves a temporary hole on the parent and releases it when IOV is disabled; the temporary resources are stored in pci_dn to avoid kmalloc/free.
Signed-off-by: Alexey Kardashevskiy Acked-by: Bjorn Helgaas Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fb5cd7511189..7e87867984e7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. */ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", i, &res2, res, (offset > 0) ? "En" : "Dis", num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &pdn->holes[i]); + memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + } + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + pdn->holes[i].start = res2.start; + pdn->holes[i].end = res2.start + size * offset - 1; + pdn->holes[i].flags = IORESOURCE_BUS; + pdn->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &pdn->holes[i]); + } } return 0; } -- cgit v1.2.3 From 4e003747043d57aa75c9762fa148ef38afe68dd8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Oct 2017 15:08:43 +1100 Subject: powerpc/64s: Replace CONFIG_PPC_STD_MMU_64 with CONFIG_PPC_BOOK3S_64 CONFIG_PPC_STD_MMU_64 indicates support for the "standard" powerpc MMU on 64-bit CPUs. The "standard" MMU refers to the hash page table MMU found in "server" processors, from IBM mainly. Currently CONFIG_PPC_STD_MMU_64 is == CONFIG_PPC_BOOK3S_64. While it's annoying to have two symbols that always have the same value, it's not quite annoying enough to bother removing one. However with the arrival of Power9, we now have the situation where CONFIG_PPC_STD_MMU_64 is enabled, but the kernel is running using the Radix MMU - *not* the "standard" MMU. So it is now actively confusing to use it, because it implies that code is disabled or inactive when the Radix MMU is in use, however that is not necessarily true. So s/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/, and do some minor formatting updates of some of the affected lines. This will be a pain for backports, but c'est la vie. 
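For backports, the bulk of the conversion can be reproduced mechanically with something like the following (illustrative only; the patch also removes the Kconfig definition of PPC_STD_MMU_64 and reflows some of the affected lines by hand):

	git grep -l CONFIG_PPC_STD_MMU_64 -- arch/powerpc | \
		xargs sed -i 's/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/g'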
Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 6 +----- arch/powerpc/platforms/pseries/lpar.c | 8 ++++---- arch/powerpc/platforms/pseries/lparcfg.c | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 13663efc1d31..f8928ee85f6b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -294,10 +294,6 @@ config PPC_STD_MMU_32 def_bool y depends on PPC_STD_MMU && PPC32 -config PPC_STD_MMU_64 - def_bool y - depends on PPC_STD_MMU && PPC64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -323,7 +319,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_STD_MMU_64 + default y if PPC_BOOK3S_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..0ee4a469a4ae 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -93,7 +93,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -106,7 +106,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -129,7 +129,7 @@ void vpa_init(int cpu) } } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, @@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_SMLPAR */ -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 779fc2a1c8f7..b2706c483067 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); -- cgit v1.2.3 From 1fd6c02207107c8892219dacef01de7ced3d4ce7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Oct 2017 17:48:49 +0200 Subject: powerpc/mm: Add a CONFIG option to choose if radix is used by default Currently if the hardware supports the radix MMU we will use it, *unless* "disable_radix" is passed on the kernel command line. However some users would like the reverse semantics, i.e. the kernel uses the hash MMU by default, unless radix is explicitly requested on the command line. So add a CONFIG option to choose whether we use radix by default or not, and expand the disable_radix command line option to allow "disable_radix=no" which *enables* radix.
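For example, the resulting command line semantics are:

	disable_radix		# use hash, regardless of the Kconfig default
	disable_radix=yes	# same as above
	disable_radix=no	# use radix, even if PPC_RADIX_MMU_DEFAULT=n
	(nothing)		# follow CONFIG_PPC_RADIX_MMU_DEFAULT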
Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f8928ee85f6b..596bd9091478 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -304,6 +304,19 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config PPC_RADIX_MMU_DEFAULT + bool "Default to using the Radix MMU when possible" + depends on PPC_RADIX_MMU + default y + help + When the hardware supports the Radix MMU, default to using it unless + "disable_radix[=yes]" is specified on the kernel command line. + + If this option is disabled, the Hash MMU will be used by default, + unless "disable_radix=no" is specified on the kernel command line. + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3 From 71e24d7731a2903b1ae2bba2b2971c654d9c2aa6 Mon Sep 17 00:00:00 2001 From: "William A. Kennington III" Date: Fri, 22 Sep 2017 16:58:00 -0700 Subject: powerpc/opal: Fix EBUSY bug in acquiring tokens The current code checks the completion map to look for the first token that is complete. In some cases, a completion can come in but the token can still be on lease to the caller processing the completion. If this completed but unreleased token is the first token found in the bitmap by another task trying to acquire a token, then the __test_and_set_bit call will fail since the token will still be on lease. The acquisition will then fail with an EBUSY. This patch reorganizes the acquisition code to look at the opal_async_token_map for an unleased token. If the token has no lease it must have no outstanding completions so we should never see an EBUSY, unless we have leased out too many tokens. Since opal_async_get_token_interruptible is protected by a semaphore, we will practically never see EBUSY anymore. Fixes: 8d7248232208 ("powerpc/powernv: Infrastructure to support OPAL async completion") Signed-off-by: William A.
Kennington III Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index cf33769a7b72..45b3feb8aa2f 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -39,18 +39,18 @@ int __opal_async_get_token(void) int token; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); if (token >= opal_max_async_tokens) { token = -EBUSY; goto out; } - if (__test_and_set_bit(token, opal_async_token_map)) { + if (!__test_and_clear_bit(token, opal_async_complete_map)) { token = -EBUSY; goto out; } - __clear_bit(token, opal_async_complete_map); + __set_bit(token, opal_async_token_map); out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); -- cgit v1.2.3 From 59cf9a1cfcd9de6392d218fcd69413f2e77babbe Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:41 +1100 Subject: powerpc/opal: Make __opal_async_{get, release}_token() static There are no callers of either __opal_async_get_token() or __opal_async_release_token() outside this file, so they can be made static. This patch also removes the possibility of an emergency synchronous call to __opal_async_get_token(); as such it makes more sense to initialise opal_async_sem for the maximum number of async tokens. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 45b3feb8aa2f..64255d3ee14a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -33,7 +33,7 @@ static struct semaphore opal_async_sem; static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; -int __opal_async_get_token(void) +static int __opal_async_get_token(void) { unsigned long flags; int token; @@ -73,7 +73,7 @@ int opal_async_get_token_interruptible(void) } EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); -int __opal_async_release_token(int token) +static int __opal_async_release_token(int token) { unsigned long flags; @@ -199,11 +199,7 @@ int __init opal_async_comp_init(void) goto out_opal_node; } - /* Initialize to 1 less than the maximum tokens available, as we may - * require to pop one during emergency through synchronous call to - * __opal_async_get_token() - */ - sema_init(&opal_async_sem, opal_max_async_tokens - 1); + sema_init(&opal_async_sem, opal_max_async_tokens); out_opal_node: of_node_put(opal_node); -- cgit v1.2.3 From 86cd6d98020924f65a6773784c66c5b842e3e320 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:42 +1100 Subject: powerpc/opal: Rework the opal-async interface Future work will add an opal_async_wait_response_interruptible() which will call wait_event_interruptible(). This work requires extra token state to be tracked as wait_event_interruptible() can return and the caller could release the token before OPAL responds. Currently token state is tracked with two 64-bit bitfields, which may be larger than needed, since OPAL informs Linux how many async tokens there are.
It also uses an array indexed by token to store response messages for each token. The bitfields make it difficult to add more state and also provide a hard maximum as to how many tokens there can be - it is possible that OPAL will inform Linux that there are more than 64 tokens. Rather than add a bitfield to track the extra state, rework the internals slightly. Signed-off-by: Cyril Bur [mpe: Fix __opal_async_get_token() when no tokens are free] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 92 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 41 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 64255d3ee14a..a8a57310759a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,7 +1,7 @@ /* * PowerNV OPAL asynchronous completion interfaces * - * Copyright 2013 IBM Corp. + * Copyright 2013-2017 IBM Corp. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,40 +23,47 @@ #include #include -#define N_ASYNC_COMPLETIONS 64 +enum opal_async_token_state { + ASYNC_TOKEN_UNALLOCATED = 0, + ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_COMPLETED +}; + +struct opal_async_token { + enum opal_async_token_state state; + struct opal_msg response; +}; -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); static DEFINE_SPINLOCK(opal_async_comp_lock); static struct semaphore opal_async_sem; -static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; +static struct opal_async_token *opal_async_tokens; static int __opal_async_get_token(void) { unsigned long flags; - int token; + int i, token = -EBUSY; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); - if (token >= opal_max_async_tokens) { - token = -EBUSY; - goto out; - } - if (!__test_and_clear_bit(token, opal_async_complete_map)) { - token = -EBUSY; - goto out; + for (i = 0; i < opal_max_async_tokens; i++) { + if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) { + opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED; + token = i; + break; + } } - __set_bit(token, opal_async_token_map); - -out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); return token; } +/* + * Note: If the returned token is used in an opal call and opal returns + * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before + * calling another other opal_async_* function + */ int opal_async_get_token_interruptible(void) { int token; @@ -76,6 +83,7 @@ EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); static int __opal_async_release_token(int token) { unsigned long flags; + int rc; if (token < 0 || token >= opal_max_async_tokens) { pr_err("%s: Passed token is out of range, token %d\n", @@ -84,11 +92,18 @@ static int __opal_async_release_token(int token) } spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); - __clear_bit(token, opal_async_token_map); + switch (opal_async_tokens[token].state) { + case ASYNC_TOKEN_COMPLETED: + case ASYNC_TOKEN_ALLOCATED: + opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; + rc = 0; + break; + default: + rc = 1; + } spin_unlock_irqrestore(&opal_async_comp_lock, 
flags); - return 0; + return rc; } int opal_async_release_token(int token) @@ -96,12 +111,10 @@ int opal_async_release_token(int token) int ret; ret = __opal_async_release_token(token); - if (ret) - return ret; - - up(&opal_async_sem); + if (!ret) + up(&opal_async_sem); - return 0; + return ret; } EXPORT_SYMBOL_GPL(opal_async_release_token); @@ -122,13 +135,15 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) * functional. */ opal_wake_poller(); - wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); - memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + wait_event(opal_async_wait, opal_async_tokens[token].state + == ASYNC_TOKEN_COMPLETED); + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); return 0; } EXPORT_SYMBOL_GPL(opal_async_wait_response); +/* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { @@ -140,9 +155,9 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); + opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); wake_up(&opal_async_wait); @@ -178,24 +193,19 @@ int __init opal_async_comp_init(void) } opal_max_async_tokens = be32_to_cpup(async); - if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) - opal_max_async_tokens = N_ASYNC_COMPLETIONS; + opal_async_tokens = kcalloc(opal_max_async_tokens, + sizeof(*opal_async_tokens), GFP_KERNEL); + if (!opal_async_tokens) { + err = -ENOMEM; + goto out_opal_node; + } err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &opal_async_comp_nb); if (err) { pr_err("%s: Can't register OPAL event notifier (%d)\n", __func__, err); - goto out_opal_node; - } - - opal_async_responses = kzalloc( - sizeof(*opal_async_responses) * opal_max_async_tokens, - GFP_KERNEL); - if (!opal_async_responses) { - pr_err("%s: Out of memory, failed to do asynchronous " - "completion init\n", __func__); - err = -ENOMEM; + kfree(opal_async_tokens); goto out_opal_node; } -- cgit v1.2.3 From 95e1bc1daaeee4d598b235dc85b64b7a0bcc3060 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Fri, 3 Nov 2017 13:41:43 +1100 Subject: powernv/opal-sensor: remove unneeded lock Parallel sensor reads could run out of async tokens due to opal_get_sensor_data grabbing tokens but then doing the sensor read behind a mutex, essentially serializing the (possibly asynchronous and relatively slow) sensor read. It turns out that the mutex isn't needed at all: not only should the OPAL interface allow concurrent reads, the implementation is certainly safe for that; and if any sensor we read from isn't, the kernel is the wrong place to do the mutual exclusion, as OPAL should be doing it for the kernel. So, remove the mutex. Additionally, we shouldn't be printing out an error when we don't get a token, as the only way this should happen is if we've been interrupted in down_interruptible() on the semaphore.
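With the mutex gone, concurrent readers are throttled only by token availability; each caller follows roughly this pattern (a sketch of the opal_get_sensor_data() flow shown in the diff below):

	token = opal_async_get_token_interruptible();	/* per-caller token */
	if (token < 0)
		return token;
	ret = opal_sensor_read(sensor_hndl, token, &data);
	if (ret == OPAL_ASYNC_COMPLETION)
		ret = opal_async_wait_response(token, &msg);	/* no global lock held */
	opal_async_release_token(token);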
Reported-by: Robert Lippert Signed-off-by: Stewart Smith Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-sensor.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index aa267f120033..0a7074bb91dc 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -19,13 +19,10 @@ */ #include -#include #include #include #include -static DEFINE_MUTEX(opal_sensor_mutex); - /* * This will return sensor information to driver based on the requested sensor * handle. A handle is an opaque id for the powernv, read by the driver from the @@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) __be32 data; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_err("%s: Couldn't get the token, returning\n", __func__); - ret = token; - goto out; - } + if (token < 0) + return token; - mutex_lock(&opal_sensor_mutex); ret = opal_sensor_read(sensor_hndl, token, &data); switch (ret) { case OPAL_ASYNC_COMPLETION: @@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) if (ret) { pr_err("%s: Failed to wait for the async response, %d\n", __func__, ret); - goto out_token; + goto out; } ret = opal_error_code(opal_get_async_rc(msg)); @@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) break; } -out_token: - mutex_unlock(&opal_sensor_mutex); - opal_async_release_token(token); out: + opal_async_release_token(token); return ret; } EXPORT_SYMBOL_GPL(opal_get_sensor_data); -- cgit v1.2.3 From 9aab24495c5644b25ced0d11816cc3c061bf74fc Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:44 +1100 Subject: powerpc/opal: Add opal_async_wait_response_interruptible() to opal-async This patch adds an _interruptible version of opal_async_wait_response(). This is useful when a long running OPAL call is performed on behalf of a userspace thread, for example, the opal_flash_{read,write,erase} functions performed by the powernv-flash MTD driver. It is foreseeable that these functions would take upwards of two minutes causing the wait_event() to block long enough to cause hung task warnings. Furthermore, wait_event_interruptible() is preferable as otherwise there is no way for signals to stop the process which is going to be confusing in userspace. 
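A caller doing a long-running operation on behalf of userspace would then follow a pattern along these lines (a sketch only, not the MTD driver's actual code; the opal_flash_read() calling convention is an assumption based on the driver named above):

	token = opal_async_get_token_interruptible();
	if (token < 0)
		return token;
	rc = opal_flash_read(id, offset, __pa(buf), size, token);
	if (rc == OPAL_ASYNC_COMPLETION) {
		rc = opal_async_wait_response_interruptible(token, &msg);
		if (rc == 0)	/* got a response */
			rc = opal_error_code(opal_get_async_rc(msg));
		/* on -ERESTARTSYS the token is freed once OPAL responds */
	}
	opal_async_release_token(token);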
Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 84 +++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index a8a57310759a..18a355fa15e8 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -26,6 +26,8 @@ enum opal_async_token_state { ASYNC_TOKEN_UNALLOCATED = 0, ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_DISPATCHED, + ASYNC_TOKEN_ABANDONED, ASYNC_TOKEN_COMPLETED }; @@ -61,8 +63,9 @@ static int __opal_async_get_token(void) /* * Note: If the returned token is used in an opal call and opal returns - * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before - * calling another other opal_async_* function + * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or + * opal_async_wait_response_interruptible() at least once before calling another + * opal_async_* function */ int opal_async_get_token_interruptible(void) { @@ -98,6 +101,14 @@ static int __opal_async_release_token(int token) opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; rc = 0; break; + /* + * DISPATCHED and ABANDONED tokens must wait for OPAL to respond. + * Mark a DISPATCHED token as ABANDONED so that the response handling + * code knows no one cares and that it can free it then. + */ + case ASYNC_TOKEN_DISPATCHED: + opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; + /* Fall through */ default: rc = 1; } @@ -130,7 +141,11 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } - /* Wakeup the poller before we wait for events to speed things + /* + * There is no need to mark the token as dispatched, wait_event() + * will block until the token completes. + * + * Wakeup the poller before we wait for events to speed things * up on platforms or simulators where the interrupts aren't * functional. */ @@ -143,11 +158,66 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) } EXPORT_SYMBOL_GPL(opal_async_wait_response); +int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg) +{ + unsigned long flags; + int ret; + + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + /* + * The first time this gets called we mark the token as DISPATCHED + * so that if wait_event_interruptible() returns not zero and the + * caller frees the token, we know not to actually free the token + * until the response comes. + * + * Only change if the token is ALLOCATED - it may have been + * completed even before the caller gets around to calling this + * the first time. + * + * There is also a dirty great comment at the token allocation + * function that if the opal call returns OPAL_ASYNC_COMPLETION to + * the caller then the caller *must* call this or the not + * interruptible version before doing anything else with the + * token. 
+ */ + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) { + spin_lock_irqsave(&opal_async_comp_lock, flags); + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) + opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED; + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + } + + /* + * Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. + */ + opal_wake_poller(); + ret = wait_event_interruptible(opal_async_wait, + opal_async_tokens[token].state == + ASYNC_TOKEN_COMPLETED); + if (!ret) + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); + + return ret; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible); + /* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { struct opal_msg *comp_msg = msg; + enum opal_async_token_state state; unsigned long flags; uint64_t token; @@ -155,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); + state = opal_async_tokens[token].state; opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); + if (state == ASYNC_TOKEN_ABANDONED) { + /* Free the token, no one else will */ + opal_async_release_token(token); + return 0; + } + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); wake_up(&opal_async_wait); return 0; -- cgit v1.2.3 From 77adbd2207e858f5923aa94e4a7d2f29f09217ed Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:45 +1100 Subject: powerpc/powernv: Add OPAL_BUSY to opal_error_code() Also export opal_error_code() so that it can be used in modules. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 65c79ecf5a4d..041ddbd1fc57 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -998,6 +998,7 @@ int opal_error_code(int rc) case OPAL_PARAMETER: return -EINVAL; case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY: case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; case OPAL_PERMISSION: return -EPERM; @@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); +EXPORT_SYMBOL_GPL(opal_error_code); -- cgit v1.2.3 From cd77b5ce208c153260ed7882d8910f2395bfaabd Mon Sep 17 00:00:00 2001 From: Shriya Date: Fri, 13 Oct 2017 10:06:41 +0530 Subject: powerpc/powernv/cpufreq: Fix the frequency read by /proc/cpuinfo Reading /proc/cpuinfo in turn calls cpufreq_quick_get(), which returns the last frequency requested by the kernel, but may not reflect the actual frequency the processor is running at. This patch calls cpufreq_get() instead, which returns the current frequency reported by the hardware.
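The difference, in brief (a sketch of generic cpufreq behaviour, not powernv-specific code):

	cpufreq_quick_get(cpu);	/* cached policy value: the last frequency
				 * the kernel requested */
	cpufreq_get(cpu);	/* invokes the cpufreq driver's ->get() hook
				 * to read the frequency from the hardware */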
Fixes: fb5153d05a7d ("powerpc: powernv: Implement ppc_md.get_proc_freq()") Signed-off-by: Shriya Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d23f148a11f0..62f4a5ad8594 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -335,7 +335,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) { unsigned long ret_freq; - ret_freq = cpufreq_quick_get(cpu) * 1000ul; + ret_freq = cpufreq_get(cpu) * 1000ul; /* * If the backend cpufreq driver does not exist, -- cgit v1.2.3 From 9003a249815a15704f415954039d1c7ea27da9ad Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 7 Nov 2017 14:43:01 +1100 Subject: powerpc/powernv/ioda: Remove explicit max window size check DMA windows can only have a power-of-two size on IODA2 hardware, and using memory_hotplug_max() to determine the upper limit won't work correctly if it returns a non-power-of-two value. This removes the check, as the platform code does this check in pnv_pci_ioda2_setup_default_config() anyway; the other client is VFIO, and that checks against the locked_vm limit, which prevents userspace from locking too much memory. It is expected to mostly impact DPDK on machines with a non-power-of-two RAM size. KVM guests are less likely to be affected, as guests usually get less than half of the host's RAM. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7e87867984e7..749055553064 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2797,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if (!is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ -- cgit v1.2.3 From e34917fbee1226144c94413697ddbf2d5b06d0d3 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:41 -0800 Subject: powerpc/vas: init missing fields from [rt]xattr Initialize a few missing window context fields from the window attributes specified by the caller. These fields are currently set to their default values by the caller (NX-842), but it would be good to apply them anyway.
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 5aae845b8cd9..cec7ab7119df 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->nx_win = rxattr->nx_win; winctx->fault_win = rxattr->fault_win; + winctx->user_win = rxattr->user_win; + winctx->rej_no_credit = rxattr->rej_no_credit; winctx->rx_word_mode = rxattr->rx_win_ord_mode; winctx->tx_word_mode = rxattr->tx_win_ord_mode; winctx->rx_wcred_mode = rxattr->rx_wcred_mode; winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + winctx->notify_early = rxattr->notify_early; if (winctx->nx_win) { winctx->data_stamp = true; @@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; winctx->pin_win = txattr->pin_win; + winctx->rej_no_credit = txattr->rej_no_credit; + winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable; winctx->rx_wcred_mode = txattr->rx_wcred_mode; winctx->tx_wcred_mode = txattr->tx_wcred_mode; winctx->rx_word_mode = txattr->rx_win_ord_mode; winctx->tx_word_mode = txattr->tx_win_ord_mode; + winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; if (winctx->nx_win) { winctx->data_stamp = true; -- cgit v1.2.3 From 51b537124fc24074aee67cae9ca94ec4d9c204fc Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:42 -0800 Subject: powerpc/vas: Validate window credits NX-842, the only user of VAS, sets the window credits to default values but VAS should check the credits against the possible max values. The VAS_WCREDS_MIN is not needed and can be dropped. 
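Evaluating the limits this patch introduces (shown in the diff below) makes the split concrete; the field-width reading is an inference, not stated by the patch:

/* VAS_RX_WCREDS_MAX = (64 << 10) - 1 = 65535 (fits a 16-bit field) */
/* VAS_TX_WCREDS_MAX = ( 4 << 10) - 1 =  4095 (fits a 12-bit field) */
/* VAS_WCREDS_DEFAULT = 1 << 10       =  1024                       */

if (attr->wcreds_max > VAS_RX_WCREDS_MAX)       /* receive windows */
        return false;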
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ arch/powerpc/platforms/powernv/vas.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index cec7ab7119df..a2fe120ac06d 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; + if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + return false; + if (attr->nx_win) { /* cannot be fault or user window if it is nx */ if (attr->fault_win || attr->user_win) @@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (cop > VAS_COP_TYPE_MAX) return false; + if (attr->wcreds_max > VAS_TX_WCREDS_MAX) + return false; + if (attr->user_win && (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) return false; diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 38dee5d50f31..fea0de44f076 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -106,8 +106,8 @@ * * TODO: Needs tuning for per-process credits */ -#define VAS_WCREDS_MIN 16 -#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) +#define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) /* -- cgit v1.2.3 From 0a2c2c24cf78473da785654361ec957f129f4820 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:43 -0800 Subject: powerpc/vas: Cleanup some debug code Clean up vas.h and the debug code around ifdef vas_debug. 
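The pr_debug() calls that replace the old "#ifdef vas_debug" printks (see the diff below) are compiled in unconditionally but gated at runtime by the dynamic debug facility. A before/after sketch of the pattern (illustrative, not the exact lines of the diff):

/* before: compiled out unless the file was built with -Dvas_debug */
#ifdef vas_debug
        pr_err("win #%d: busy\n", winid);
#endif

/*
 * after: always built, enabled at runtime with CONFIG_DYNAMIC_DEBUG,
 * e.g. echo 'file vas-window.c +p' > /sys/kernel/debug/dynamic_debug/control
 */
        pr_debug("win #%d: busy\n", winid);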
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 8 +++-- arch/powerpc/platforms/powernv/vas.h | 54 ++++++----------------------- 2 files changed, 17 insertions(+), 45 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a2fe120ac06d..67ffc5d994cc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, static bool rx_win_args_valid(enum vas_cop_type cop, struct vas_rx_win_attr *attr) { - dump_rx_win_attr(attr); + pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early, + attr->rx_fifo_size); if (cop >= VAS_COP_TYPE_MAX) return false; @@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) else rc = -EINVAL; - print_fifo_msg_count(txwin); + pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid, + read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); return rc; } diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index fea0de44f076..63e8e037eda0 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -258,6 +258,16 @@ #define VAS_NX_UTIL_ADDER_OFFSET 0x180 #define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + /* * Local Notify Scope Control Register. (Receive windows only). */ @@ -385,43 +395,15 @@ struct vas_winctx { extern struct vas_instance *find_vas_instance(int vasid); -/* - * VREG(x): - * Expand a register's short name (eg: LPID) into two parameters: - * - the register's short name in string form ("LPID"), and - * - the name of the macro (eg: VAS_LPID_OFFSET), defining the - * register's offset in the window context - */ -#define VREG_SFX(n, s) __stringify(n), VAS_##n##s -#define VREG(r) VREG_SFX(r, _OFFSET) - -#ifdef vas_debug -static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) -{ - pr_err("fault %d, notify %d, intr %d early %d\n", - attr->fault_win, attr->notify_disable, - attr->intr_disable, attr->notify_early); - - pr_err("rx_fifo_size %d, max value %d\n", - attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); -} - static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) { if (val) - pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n", win->tx_win ? 
"Tx" : "Rx", win->winid, name, regptr, val); } -#else /* vas_debug */ - -#define vas_log_write(win, name, reg, val) -#define dump_rx_win_attr(attr) - -#endif /* vas_debug */ - static inline void write_uwc_reg(struct vas_window *win, char *name, s32 reg, u64 val) { @@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } -#ifdef vas_debug - -static void print_fifo_msg_count(struct vas_window *txwin) -{ - uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); - pr_devel("Winid %d, Msg count %llu\n", txwin->winid, - (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); -} -#else /* vas_debug */ - -#define print_fifo_msg_count(window) - -#endif /* vas_debug */ - #endif /* _VAS_H */ -- cgit v1.2.3 From 4963ac3632dda7433db5149d6abdfc644a8d8ab2 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:44 -0800 Subject: powerpc/vas: Drop poll_window_cast_out(). Polling for window cast out is listed in the spec, but turns out that it is not strictly necessary and slows down window close. Making it a stub for now. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 67ffc5d994cc..8ab8a8208347 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1079,25 +1079,25 @@ retry: } } +/* + * Have the hardware cast a window out of cache and wait for it to + * be completed. + * + * NOTE: It can take a relatively long time to cast the window context + * out of the cache. It is not strictly necessary to cast out if: + * + * - we clear the "Pin Window" bit (so hardware is free to evict) + * + * - we re-initialize the window context when it is reassigned. + * + * We do the former in vas_win_close() and latter in vas_win_open(). + * So, ignoring the cast-out for now. We can add it as needed. If + * casting out becomes necessary we should consider offloading the + * job to a worker thread, so the window close can proceed quickly. + */ static void poll_window_castout(struct vas_window *window) { - int cached; - u64 val; - - /* Cast window context out of the cache */ -retry: - val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); - cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); - if (cached) { - val = 0ULL; - val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); - val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); - write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - goto retry; - } + /* stub for now */ } /* -- cgit v1.2.3 From 36a288fe9dab9a6b0b50ffdb5c34f04c42cee2ac Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:45 -0800 Subject: powerpc/vas: Use helper to unpin/close window Use a helper to have the hardware unpin and mark a window closed. 
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 8ab8a8208347..95622a984b05 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1100,6 +1100,20 @@ static void poll_window_castout(struct vas_window *window) /* stub for now */ } +/* + * Unpin and close a window so no new requests are accepted and the + * hardware can evict this window from cache if necessary. + */ +static void unpin_close_window(struct vas_window *window) +{ + u64 val; + + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); +} + /* * Close a window. * @@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window) */ int vas_win_close(struct vas_window *window) { - u64 val; - if (!window) return 0; @@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window) poll_window_busy_state(window); - /* Unpin window from cache and close it */ - val = read_hvwc_reg(window, VREG(WINCTL)); - val = SET_FIELD(VAS_WINCTL_PIN, val, 0); - val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); - write_hvwc_reg(window, VREG(WINCTL), val); + unpin_close_window(window); poll_window_castout(window); -- cgit v1.2.3 From dfe954e4456277effffb2c5add47fa25390f8cea Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:46 -0800 Subject: powerpc/vas: Reduce polling interval for busy state A VAS window is normally in "busy" state for only a short duration. Reduce the time we wait for the window to go to "not-busy" state to speed-up vas_win_close() a bit. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 95622a984b05..1422cdd7d917 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * Wait for the window to go to "not-busy" state. It should only take a + * short time to queue a CRB, so window should not be busy for too long. + * Trying 5ms intervals. + */ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; retry: - /* - * Poll Window Busy flag - */ val = read_hvwc_reg(window, VREG(WIN_STATUS)); busy = GET_FIELD(VAS_WIN_BUSY, val); if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(msecs_to_jiffies(5)); goto retry; } } -- cgit v1.2.3 From 62f659e08ccd657ead6901011f5e542dbdc477c5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:47 -0800 Subject: powerpc/vas: Save configured window credits Save the configured max window credits for a window in the vas_window structure. We will need this when polling for return of window credits. 
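Both the existing busy-state poll and the credits poll added in the next patch follow the same sleep-then-recheck idiom; a distilled sketch (example_condition_met is a hypothetical stand-in for reading a window status register):

static void example_poll(struct vas_window *window)
{
        while (!example_condition_met(window)) {
                /* sleep ~10ms between reads instead of busy-waiting */
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(msecs_to_jiffies(10));
        }
}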
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++-- arch/powerpc/platforms/powernv/vas.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 1422cdd7d917..a59a187c0cd1 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->rx_fifo = rxattr->rx_fifo; winctx->rx_fifo_size = rxattr->rx_fifo_size; - winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = rxwin->wcreds_max; winctx->pin_win = rxattr->pin_win; winctx->nx_win = rxattr->nx_win; @@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; + rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; if (rxattr->user_win) rxwin->pid = task_pid_vnr(current); @@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin, */ memset(winctx, 0, sizeof(struct vas_winctx)); - winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = txwin->wcreds_max; winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; @@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->nx_win = txwin->rxwin->nx_win; txwin->pid = attr->pid; txwin->user_win = attr->user_win; + txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; init_winctx_for_txwin(txwin, attr, &winctx); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 63e8e037eda0..02d8a31d9051 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -332,6 +332,7 @@ struct vas_window { void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ pid_t pid; /* Linux process id of owner */ + int wcreds_max; /* Window credits */ /* Fields applicable only to send windows */ void *paste_kaddr; -- cgit v1.2.3 From 6fccac16c578c699bf0714a6c930b0ceb81305a0 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:48 -0800 Subject: powerpc/vas: poll for return of window credits Normally, the NX driver waits for the CRBs to be processed before closing the window. But it is better to ensure that the credits are returned before the window gets reassigned later. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a59a187c0cd1..23c13a7dcf89 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1062,6 +1062,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * If credit checking is enabled for this window, poll for the return + * of window credits (i.e for NX engines to process any outstanding CRBs). + * Since NX-842 waits for the CRBs to be processed before closing the + * window, we should not have to wait for too long. + * + * TODO: We retry in 10ms intervals now. 
We could/should probably peek at + * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending + * CRBs on the FIFO and compute the delay dynamically on each retry. + * But that is not really needed until we support NX-GZIP access from + * user space. (NX-842 driver waits for CSB and Fast thread-wakeup + * doesn't use credit checking). + */ +static void poll_window_credits(struct vas_window *window) +{ + u64 val; + int creds, mode; + + val = read_hvwc_reg(window, VREG(WINCTL)); + if (window->tx_win) + mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val); + else + mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val); + + if (!mode) + return; +retry: + if (window->tx_win) { + val = read_hvwc_reg(window, VREG(TX_WCRED)); + creds = GET_FIELD(VAS_TX_WCRED, val); + } else { + val = read_hvwc_reg(window, VREG(LRX_WCRED)); + creds = GET_FIELD(VAS_LRX_WCRED, val); + } + + if (creds < window->wcreds_max) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(10)); + goto retry; + } +} + /* * Wait for the window to go to "not-busy" state. It should only take a * short time to queue a CRB, so window should not be busy for too long. @@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window) unpin_close_window(window); + poll_window_credits(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ -- cgit v1.2.3 From ca03258b6b338b392c778bed9c7dd56e6a513012 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:49 -0800 Subject: powerpc/vas: Create cpu to vas id mapping Create a cpu to vasid mapping so callers can specify -1 instead of trying to find a VAS id. Changelog[v2] [Michael Ellerman] Use per-cpu variables to simplify code. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index 565a4878fefa..abb7090a22b4 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -18,15 +18,18 @@ #include #include #include +#include #include "vas.h" static DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); +static DEFINE_PER_CPU(int, cpu_vas_id); + static int init_vas_instance(struct platform_device *pdev) { - int rc, vasid; + int rc, cpu, vasid; struct resource *res; struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; @@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev) "paste_win_id_shift 0x%llx\n", pdev->name, vasid, vinst->paste_base_addr, vinst->paste_win_id_shift); + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) + per_cpu(cpu_vas_id, cpu) = vasid; + } + mutex_lock(&vas_mutex); list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); @@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid) struct vas_instance *vinst; mutex_lock(&vas_mutex); + + if (vasid == -1) + vasid = per_cpu(cpu_vas_id, smp_processor_id()); + list_for_each(ent, &vas_instances) { vinst = list_entry(ent, struct vas_instance, node); if (vinst->vas_id == vasid) { -- cgit v1.2.3 From d4ef61b5e8955fb913e2e1a6c1533414859a839d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:50 -0800 Subject: powerpc/vas, nx-842: Define and use chip_to_vas_id() Define a helper, chip_to_vas_id() to map a given chip id to 
corresponding vas id. Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS window to be on the same chip where the calling thread is executing. These callers can pass in -1 for the VAS id. This interface will be useful if a thread running on one chip wants to open a window on another chip (like the NX-842 driver does during start up). Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index abb7090a22b4..cd9a733d05e2 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid) return NULL; } +int chip_to_vas_id(int chipid) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == chipid) + return per_cpu(cpu_vas_id, cpu); + } + return -1; +} + static int vas_probe(struct platform_device *pdev) { return init_vas_instance(pdev); -- cgit v1.2.3 From ece4e51291485bb4a71ff554964948b02ab89823 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:51 -0800 Subject: powerpc/vas: Export HVWC to debugfs Export the VAS Window context information to debugfs. We need to hold a mutex when closing the window to prevent a race with the debugfs read(). Rather than introduce a per-instance mutex, we use the global vas_mutex for now, since it is not heavily contended. The window->cop field is only relevant to a receive window so we were not setting it for a send window (which is is paired to a receive window anyway). But to simplify reporting in debugfs, set the 'cop' field for the send window also. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/Makefile | 3 +- arch/powerpc/platforms/powernv/vas-debug.c | 209 ++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/vas-window.c | 34 ++++- arch/powerpc/platforms/powernv/vas.c | 6 +- arch/powerpc/platforms/powernv/vas.h | 14 ++ 5 files changed, 257 insertions(+), 9 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 37d60f7dd86d..17921c45d10b 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -14,4 +14,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_FTW) += nx-ftw.o diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c new file mode 100644 index 000000000000..ca22f1eae050 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include "vas.h" + +static struct dentry *vas_debugfs; + +static char *cop_to_str(int cop) +{ + switch (cop) { + case VAS_COP_TYPE_FAULT: return "Fault"; + case VAS_COP_TYPE_842: return "NX-842 Normal Priority"; + case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority"; + case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority"; + case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority"; + case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup"; + default: return "Unknown"; + } +} + +static int info_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), + window->tx_win ? "Send" : "Receive"); + seq_printf(s, "Pid : %d\n", window->pid); + +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int info_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_dbg_show, inode->i_private); +} + +static const struct file_operations info_fops = { + .open = info_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void print_reg(struct seq_file *s, struct vas_window *win, + char *name, u32 reg) +{ + seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name); +} + +static int hvwc_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + print_reg(s, window, VREG(LPID)); + print_reg(s, window, VREG(PID)); + print_reg(s, window, VREG(XLATE_MSR)); + print_reg(s, window, VREG(XLATE_LPCR)); + print_reg(s, window, VREG(XLATE_CTL)); + print_reg(s, window, VREG(AMR)); + print_reg(s, window, VREG(SEIDR)); + print_reg(s, window, VREG(FAULT_TX_WIN)); + print_reg(s, window, VREG(OSU_INTR_SRC_RA)); + print_reg(s, window, VREG(HV_INTR_SRC_RA)); + print_reg(s, window, VREG(PSWID)); + print_reg(s, window, VREG(LFIFO_BAR)); + print_reg(s, window, VREG(LDATA_STAMP_CTL)); + print_reg(s, window, VREG(LDMA_CACHE_CTL)); + print_reg(s, window, VREG(LRFIFO_PUSH)); + print_reg(s, window, VREG(CURR_MSG_COUNT)); + print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT)); + print_reg(s, window, VREG(LRX_WCRED)); + print_reg(s, window, VREG(LRX_WCRED_ADDER)); + print_reg(s, window, VREG(TX_WCRED)); + print_reg(s, window, VREG(TX_WCRED_ADDER)); + print_reg(s, window, VREG(LFIFO_SIZE)); + print_reg(s, window, VREG(WINCTL)); + print_reg(s, window, VREG(WIN_STATUS)); + print_reg(s, window, VREG(WIN_CTX_CACHING_CTL)); + print_reg(s, window, VREG(TX_RSVD_BUF_COUNT)); + print_reg(s, window, VREG(LRFIFO_WIN_PTR)); + print_reg(s, window, VREG(LNOTIFY_CTL)); + print_reg(s, window, VREG(LNOTIFY_PID)); + print_reg(s, window, VREG(LNOTIFY_LPID)); + print_reg(s, window, VREG(LNOTIFY_TID)); + print_reg(s, window, VREG(LNOTIFY_SCOPE)); + print_reg(s, window, VREG(NX_UTIL_ADDER)); +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int hvwc_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, hvwc_dbg_show, inode->i_private); +} + +static const struct file_operations hvwc_fops = { + .open = hvwc_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void vas_window_free_dbgdir(struct vas_window *window) +{ + if (window->dbgdir) { + 
debugfs_remove_recursive(window->dbgdir); + kfree(window->dbgname); + window->dbgdir = NULL; + window->dbgname = NULL; + } +} + +void vas_window_init_dbgdir(struct vas_window *window) +{ + struct dentry *f, *d; + + if (!window->vinst->dbgdir) + return; + + window->dbgname = kzalloc(16, GFP_KERNEL); + if (!window->dbgname) + return; + + snprintf(window->dbgname, 16, "w%d", window->winid); + + d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir); + if (IS_ERR(d)) + goto free_name; + + window->dbgdir = d; + + f = debugfs_create_file("info", 0444, d, window, &info_fops); + if (IS_ERR(f)) + goto remove_dir; + + f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); + if (IS_ERR(f)) + goto remove_dir; + + return; + +free_name: + kfree(window->dbgname); + window->dbgname = NULL; + +remove_dir: + debugfs_remove_recursive(window->dbgdir); + window->dbgdir = NULL; +} + +void vas_instance_init_dbgdir(struct vas_instance *vinst) +{ + struct dentry *d; + + if (!vas_debugfs) + return; + + vinst->dbgname = kzalloc(16, GFP_KERNEL); + if (!vinst->dbgname) + return; + + snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id); + + d = debugfs_create_dir(vinst->dbgname, vas_debugfs); + if (IS_ERR(d)) + goto free_name; + + vinst->dbgdir = d; + return; + +free_name: + kfree(vinst->dbgname); + vinst->dbgname = NULL; + vinst->dbgdir = NULL; +} + +void vas_init_dbgdir(void) +{ + vas_debugfs = debugfs_create_dir("vas", NULL); + if (IS_ERR(vas_debugfs)) + vas_debugfs = NULL; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 23c13a7dcf89..c030d4cf982e 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -145,23 +145,37 @@ static void unmap_paste_region(struct vas_window *window) } /* - * Unmap the MMIO regions for a window. + * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't + * unmap when the window's debugfs dir is in use. This serializes close + * of a window even on another VAS instance but since its not a critical + * path, just minimize the time we hold the mutex for now. We can add + * a per-instance mutex later if necessary. 
*/ static void unmap_winctx_mmio_bars(struct vas_window *window) { int len; + void *uwc_map; + void *hvwc_map; u64 busaddr_start; - if (window->hvwc_map) { + mutex_lock(&vas_mutex); + + hvwc_map = window->hvwc_map; + window->hvwc_map = NULL; + + uwc_map = window->uwc_map; + window->uwc_map = NULL; + + mutex_unlock(&vas_mutex); + + if (hvwc_map) { get_hvwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->hvwc_map, busaddr_start, len); - window->hvwc_map = NULL; + unmap_region(hvwc_map, busaddr_start, len); } - if (window->uwc_map) { + if (uwc_map) { get_uwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->uwc_map, busaddr_start, len); - window->uwc_map = NULL; + unmap_region(uwc_map, busaddr_start, len); } } @@ -528,6 +542,9 @@ static void vas_window_free(struct vas_window *window) struct vas_instance *vinst = window->vinst; unmap_winctx_mmio_bars(window); + + vas_window_free_dbgdir(window); + kfree(window); vas_release_window_id(&vinst->ida, winid); @@ -552,6 +569,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst) if (map_winctx_mmio_bars(window)) goto out_free; + vas_window_init_dbgdir(window); + return window; out_free: @@ -974,6 +993,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto put_rxwin; } + txwin->cop = cop; txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index cd9a733d05e2..c488621dbec3 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -22,7 +22,7 @@ #include "vas.h" -static DEFINE_MUTEX(vas_mutex); +DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); @@ -86,6 +86,8 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + vas_instance_init_dbgdir(vinst); + dev_set_drvdata(&pdev->dev, vinst); return 0; @@ -157,6 +159,8 @@ static int __init vas_init(void) int found = 0; struct device_node *dn; + vas_init_dbgdir(); + platform_driver_register(&vas_driver); for_each_compatible_node(dn, NULL, "ibm,vas") { diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 02d8a31d9051..756cbc5335bc 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include /* * Overview of Virtual Accelerator Switchboard (VAS). 
@@ -317,6 +319,9 @@ struct vas_instance { struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; + + char *dbgname; + struct dentry *dbgdir; }; /* @@ -334,6 +339,9 @@ struct vas_window { pid_t pid; /* Linux process id of owner */ int wcreds_max; /* Window credits */ + char *dbgname; + struct dentry *dbgdir; + /* Fields applicable only to send windows */ void *paste_kaddr; char *paste_addr_name; @@ -394,7 +402,13 @@ struct vas_winctx { enum vas_notify_after_count notify_after_count; }; +extern struct mutex vas_mutex; + extern struct vas_instance *find_vas_instance(int vasid); +extern void vas_init_dbgdir(void); +extern void vas_instance_init_dbgdir(struct vas_instance *vinst); +extern void vas_window_init_dbgdir(struct vas_window *win); +extern void vas_window_free_dbgdir(struct vas_window *win); static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) -- cgit v1.2.3 From 5676be2fb7035ac32da3a96241611e7eddff6157 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:56 -0800 Subject: powerpc/vas: Define vas_win_paste_addr() Define an interface that the NX drivers can use to find the physical paste address of a send window. This interface is expected to be used with the mmap() operation of the NX driver's device. i.e the user space process can use driver's mmap() operation to map the send window's paste address into their address space and then use copy and paste instructions to submit the CRBs to the NX engine. Note that kernel drivers will use vas_paste_crb() directly and don't need this interface. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index c030d4cf982e..d7d06533a1e9 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } +u64 vas_win_paste_addr(struct vas_window *win) +{ + u64 addr; + + compute_paste_address(win, &addr, NULL); + + return addr; +} +EXPORT_SYMBOL(vas_win_paste_addr); + static inline void get_hvwc_mmio_bar(struct vas_window *window, u64 *start, int *len) { -- cgit v1.2.3 From 61f3cca8cda979646c24accd9dbf3e2de7ea6ceb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:57 -0800 Subject: powerpc/vas: Define vas_win_id() Define an interface to return a system-wide unique id for a given VAS window. The vas_win_id() will be used in a follow-on patch to generate an unique handle for a user space receive window. Applications can use this handle to pair send and receive windows for fast thread-wakeup. The hardware refers to this system-wide unique id as a Partition Send Window ID which is expected to be used during fault handling. Hence the "pswid" in the function names. 
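Using the encode/decode helpers the diff below introduces, a worked example (values chosen for illustration): VAS id 3 and window id 42 pack into 0x0300002a, with the VAS id in the top byte (bits 0:7 in the big-endian numbering of the comment) and the window id in the low 16 bits:

u32 pswid = encode_pswid(3, 42);        /* 3 << 24 | 42 == 0x0300002a */
int vasid, winid;

decode_pswid(pswid, &vasid, &winid);    /* vasid == 3, winid == 42 */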
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 9 +++++++++ arch/powerpc/platforms/powernv/vas.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index d7d06533a1e9..82754924c2bc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1235,3 +1235,12 @@ int vas_win_close(struct vas_window *window) return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return a system-wide unique window id for the window @win. + */ +u32 vas_win_id(struct vas_window *win) +{ + return encode_pswid(win->vinst->vas_id, win->winid); +} +EXPORT_SYMBOL_GPL(vas_win_id); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 756cbc5335bc..ae0100fd35bb 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + u32 pswid = 0; + + pswid |= vasid << (31 - 7); + pswid |= winid; + + return pswid; +} + +static inline void decode_pswid(u32 pswid, int *vasid, int *winid) +{ + if (vasid) + *vasid = pswid >> (31 - 7) & 0xFF; + + if (winid) + *winid = pswid & 0xFFFF; +} #endif /* _VAS_H */ -- cgit v1.2.3 From 6c8e6bb2a52d5e7ae5bbde21c21f6d5dfd6e9ae8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:58 -0800 Subject: powerpc/vas: Add support for user receive window Add support for user space receive window (for the Fast thread-wakeup coprocessor type) Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 56 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 82754924c2bc..2b3eb01ab110 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -16,7 +16,8 @@ #include #include #include - +#include +#include #include "vas.h" #include "copy-paste.h" @@ -597,6 +598,32 @@ static void put_rx_win(struct vas_window *rxwin) atomic_dec(&rxwin->num_txwins); } +/* + * Find the user space receive window given the @pswid. + * - We must have a valid vasid and it must belong to this instance. + * (so both send and receive windows are on the same VAS instance) + * - The window must refer to an OPEN, FTW, RECEIVE window. + * + * NOTE: We access ->windows[] table and assume that vinst->mutex is held. 
+ */ +static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) +{ + int vasid, winid; + struct vas_window *rxwin; + + decode_pswid(pswid, &vasid, &winid); + + if (vinst->vas_id != vasid) + return ERR_PTR(-EINVAL); + + rxwin = vinst->windows[winid]; + + if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW) + return ERR_PTR(-EINVAL); + + return rxwin; +} + /* * Get the VAS receive window associated with NX engine identified * by @cop and if applicable, @pswid. @@ -610,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, mutex_lock(&vinst->mutex); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) - rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + if (cop == VAS_COP_TYPE_FTW) + rxwin = get_user_rxwin(vinst, pswid); else - rxwin = ERR_PTR(-EINVAL); + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); if (!IS_ERR(rxwin)) atomic_inc(&rxwin->num_txwins); @@ -937,10 +964,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->tx_word_mode = txattr->tx_win_ord_mode; winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; - if (winctx->nx_win) { + winctx->intr_disable = true; + if (winctx->nx_win) winctx->data_stamp = true; - winctx->intr_disable = true; - } winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; @@ -985,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, if (!tx_win_args_valid(cop, attr)) return ERR_PTR(-EINVAL); + /* + * If caller did not specify a vasid but specified the PSWID of a + * receive window (applicable only to FTW windows), use the vasid + * from that receive window. + */ + if (vasid == -1 && attr->pswid) + decode_pswid(attr->pswid, &vasid, NULL); + vinst = find_vas_instance(vasid); if (!vinst) { pr_devel("vasid %d not found!\n", vasid); @@ -1031,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } + /* + * Now that we have a send window, ensure context switch issues + * CP_ABORT for this thread. + */ + rc = -EINVAL; + if (set_thread_uses_vas() < 0) + goto free_window; + set_vinst_win(vinst, txwin); return txwin; -- cgit v1.2.3 From 2a31ad093bb1b8c1f1e04cbe222ad17bc09c4534 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:48:59 +1000 Subject: powerpc/powernv/npu: Use flush_all_mm() instead of flush_tlb_mm() With the optimisations introduced by commit a46cc7a908 ("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no longer flushes the page walk cache with radix. Switch to using flush_all_mm() to ensure the pwc and tlb are properly flushed on the nmmu. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2cb6cbea4b3b..2fff9a65975b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -549,7 +549,7 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * Unfortunately the nest mmu does not support flushing specific * addresses so we have to flush the whole mm. 
*/ - flush_tlb_mm(npu_context->mm); + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch -- cgit v1.2.3 From 1b2c2b12386f9bb009a2249eca00e01a9d76d7c1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:49:00 +1000 Subject: powerpc/powernv/npu: Don't explicitly flush nmmu tlb The nest mmu required an explicit flush as a tlbi would not flush it in the same way as the core. However an alternate firmware fix exists which should eliminate the need for this flush, so instead add a device-tree property (ibm,nmmu-flush) on the NVLink2 PHB to enable it only if required. Signed-off-by: Alistair Popple Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 28 +++++++++++++++++++++++----- arch/powerpc/platforms/powernv/pci.h | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2fff9a65975b..f6cbc1a71472 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -395,6 +395,7 @@ struct npu_context { struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; struct mmu_notifier mn; struct kref kref; + bool nmmu_flush; /* Callback to stop translation requests on a given GPU */ struct npu_context *(*release_cb)(struct npu_context *, void *); @@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_all_mm(npu_context->mm); + if (npu_context->nmmu_flush) + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm once before + * shooting down the GPU translation. 
+ */ + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch @@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-ENODEV); npu_context->npdev[npu->index][nvlink_index] = npdev; + if (!nphb->npu.nmmu_flush) { + /* + * If we're not explicitly flushing ourselves we need to mark + * the thread for global flushes + */ + npu_context->nmmu_flush = false; + mm_context_add_copro(mm); + } else + npu_context->nmmu_flush = true; + return npu_context; } EXPORT_SYMBOL(pnv_npu2_init_context); @@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref) struct npu_context *npu_context = container_of(kref, struct npu_context, kref); + if (!npu_context->nmmu_flush) + mm_context_remove_copro(npu_context->mm); + npu_context->mm->context.npu_context = NULL; mmu_notifier_unregister(&npu_context->mn, npu_context->mm); @@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + phb->npu.nmmu_flush = + of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); if (gpdev) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 56d1f272d4ad..96151b3a2dd4 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -187,6 +187,9 @@ struct pnv_phb { /* Bitmask for MMIO register usage */ unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; } npu; #ifdef CONFIG_CXL_BASE -- cgit v1.2.3
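Condensing the control flow of this last patch into a sketch (illustrative; hose_dn and mm stand in for the PHB device node and the process mm, and the real code spreads this across pnv_npu2_init() and pnv_npu2_init_context()):

/* Firmware lacking the alternate fix advertises "ibm,nmmu-flush". */
if (of_property_read_bool(hose_dn, "ibm,nmmu-flush")) {
        npu_context->nmmu_flush = true;         /* flush_all_mm() by hand */
} else {
        npu_context->nmmu_flush = false;
        mm_context_add_copro(mm);               /* rely on global flushes */
}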