From b9fde58db7e5738cacb740b0ec547933fe314fbe Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Sep 2017 16:35:44 +1000 Subject: powerpc/powernv: Rework EEH initialization on powernv Remove the post_init callback, which is only used by powernv; we can just call it explicitly from the powernv code. This partially kills the ability to "disable" eeh at runtime via debugfs, as this was calling that same callback again, but this is both unused and broken in several ways. If we want to revive it, we need to create a dedicated enable/disable callback on the backend that does the right thing. Let the bulk of eeh initialize normally at core_initcall() like it does on pseries, by removing the hack in eeh_init() that delays it. Instead we make sure our eeh->probe cleanly bails out if the PEs haven't been created yet, and we force a re-probe where we used to call eeh_init() again. Signed-off-by: Benjamin Herrenschmidt Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 42 +++++++++++++++------------- arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- arch/powerpc/platforms/powernv/pci.h | 1 + 3 files changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8864065eba22..4650fb294e7a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,6 @@ #include "powernv.h" #include "pci.h" -static bool pnv_eeh_nb_init = false; static int eeh_event_irq = -EINVAL; static int pnv_eeh_init(void) @@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); * been built. If the I/O cache staff has been built, EEH is * ready to supply service.
*/ -static int pnv_eeh_post_init(void) +int pnv_eeh_post_init(void) { struct pci_controller *hose; struct pnv_phb *phb; int ret = 0; - /* Register OPAL event notifier */ - if (!pnv_eeh_nb_init) { - eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); - if (eeh_event_irq < 0) { - pr_err("%s: Can't register OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return eeh_event_irq; - } + /* Probe devices & build address cache */ + eeh_probe_devices(); + eeh_addr_cache_build(); - ret = request_irq(eeh_event_irq, pnv_eeh_event, - IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); - if (ret < 0) { - irq_dispose_mapping(eeh_event_irq); - pr_err("%s: Can't request OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return ret; - } + /* Register OPAL event notifier */ + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } - pnv_eeh_nb_init = true; + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return ret; } if (!eeh_enabled()) @@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* Skip if we haven't probed yet */ + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + return NULL; + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", .init = pnv_eeh_init, - .post_init = pnv_eeh_post_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_pe_addr = pnv_eeh_get_pe_addr, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57f9e55f4352..fb5cd7511189 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3293,8 +3293,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a95273c524f6..56d1f272d4ad 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -234,6 +234,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); -- cgit v1.2.3 From 5080332c2c893118dbc18755f35c8b0131cf0fc4 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:25:48 +1000 Subject: powerpc/64s: Add workaround for P9 vector CI load issue POWER9 DD2.1 and earlier have an issue where some cache inhibited vector loads will return bad data. The workaround is in two parts: one firmware/microcode part triggers HMI interrupts when hitting such loads, and the other part is this patch, which then emulates the instructions in Linux. The affected instructions are limited to lxvd2x, lxvw4x, lxvb16x and lxvh8x.
When an instruction triggers the HMI, all threads in the core will be sent to the HMI handler, not just the one running the vector load. In general, these spurious HMIs are detected by the emulation code and we just return to the running process. Unfortunately, if a spurious interrupt occurs on a vector load to normal memory, we have no way to detect that it's spurious (unless we walk the page tables, which is very expensive). In this case we emulate the load, but we need to do so using a vector load itself to ensure 128-bit atomicity is preserved. Some additional debugfs counters for emulated instructions are also added. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt [mpe: Switch CONFIG_PPC_BOOK3S_64 to CONFIG_VSX to unbreak the build] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/smp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c17f81e433f7..355d3f99cafb 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) -- cgit v1.2.3 From e36d0a2ed5019184bb9b94ff1138c87c05905789 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:42 +1000 Subject: powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET This allows MSR[EE]=0 lockups to be detected on an OPAL (bare metal) system similarly to the hcall NMI IPI on pseries guests, when the platform/firmware supports it.
This is an example of CPU10 spinning with interrupts hard disabled: Watchdog CPU:32 detected Hard LOCKUP other CPUS:10 Watchdog CPU:10 Hard LOCKUP CPU: 10 PID: 4410 Comm: bash Not tainted 4.13.0-rc7-00074-ge89ce1f89f62-dirty #34 task: c0000003a82b4400 task.stack: c0000003af55c000 NIP: c0000000000a7b38 LR: c000000000659044 CTR: c0000000000a7b00 REGS: c00000000fd23d80 TRAP: 0100 Not tainted (4.13.0-rc7-00074-ge89ce1f89f62-dirty) MSR: 90000000000c1033 CR: 28422222 XER: 20000000 CFAR: c0000000000a7b38 SOFTE: 0 GPR00: c000000000659044 c0000003af55fbb0 c000000001072a00 0000000000000078 GPR04: c0000003c81b5c80 c0000003c81cc7e8 9000000000009033 0000000000000000 GPR08: 0000000000000000 c0000000000a7b00 0000000000000001 9000000000001003 GPR12: c0000000000a7b00 c00000000fd83200 0000000010180df8 0000000010189e60 GPR16: 0000000010189ed8 0000000010151270 000000001018bd88 000000001018de78 GPR20: 00000000370a0668 0000000000000001 00000000101645e0 0000000010163c10 GPR24: 00007fffd14d6294 00007fffd14d6290 c000000000fba6f0 0000000000000004 GPR28: c000000000f351d8 0000000000000078 c000000000f4095c 0000000000000000 NIP [c0000000000a7b38] sysrq_handle_xmon+0x38/0x40 LR [c000000000659044] __handle_sysrq+0xe4/0x270 Call Trace: [c0000003af55fbd0] [c000000000659044] __handle_sysrq+0xe4/0x270 [c0000003af55fc70] [c000000000659810] write_sysrq_trigger+0x70/0xa0 [c0000003af55fca0] [c0000000003da650] proc_reg_write+0xb0/0x110 [c0000003af55fcf0] [c0000000003423bc] __vfs_write+0x6c/0x1b0 [c0000003af55fd90] [c000000000344398] vfs_write+0xd8/0x240 [c0000003af55fde0] [c00000000034632c] SyS_write+0x6c/0x110 [c0000003af55fe30] [c00000000000b220] system_call+0x58/0x6c Signed-off-by: Nicholas Piggin [mpe: Use kernel types for opal_signal_system_reset()] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/setup.c | 1 + arch/powerpc/platforms/powernv/smp.c | 52 ++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 8c1ede2d3f7e..37cd170201a2 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..cf52d53da460 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -282,6 +282,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; pm_power_off = pnv_power_off; ppc_md.halt = pnv_halt; + /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; ppc_md.hmi_exception_early = opal_hmi_exception_early; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 355d3f99cafb..ba030669eca1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ 
b/arch/powerpc/platforms/powernv/smp.c @@ -297,6 +297,54 @@ static void __init pnv_smp_probe(void) } } +static int pnv_system_reset_exception(struct pt_regs *regs) +{ + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; +} + +static int pnv_cause_nmi_ipi(int cpu) +{ + int64_t rc; + + if (cpu >= 0) { + rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu)); + if (rc != OPAL_SUCCESS) + return 0; + return 1; + + } else if (cpu == NMI_IPI_ALL_OTHERS) { + bool success = true; + int c; + + + /* + * We do not use broadcasts (yet), because it's not clear + * exactly what semantics Linux wants or the firmware should + * provide. + */ + for_each_online_cpu(c) { + if (c == smp_processor_id()) + continue; + + rc = opal_signal_system_reset( + get_hard_smp_processor_id(c)); + if (rc != OPAL_SUCCESS) + success = false; + } + if (success) + return 1; + + /* + * Caller will fall back to doorbells, which may pick + * up the remainders. + */ + } + + return 0; +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ @@ -315,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = { /* This is called very early during platform setup_arch */ void __init pnv_smp_init(void) { + if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) { + ppc_md.system_reset_exception = pnv_system_reset_exception; + pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi; + } smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 01451ad47e2724eb123e87a5bae04e943046b87a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 22 Sep 2017 17:05:00 +0530 Subject: powerpc/powermac: Use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powermac/low_i2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 70183eb3d5c8..39a1d4225e0f 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) mutex_init(&host->mutex); init_completion(&host->complete); spin_lock_init(&host->lock); - init_timer(&host->timeout_timer); - host->timeout_timer.function = kw_i2c_timeout; - host->timeout_timer.data = (unsigned long)host; + setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host); psteps = of_get_property(np, "AAPL,address-step", NULL); steps = psteps ? (*psteps) : 0x10; -- cgit v1.2.3 From c6baa077b784c3b37391a8c11f433e3f881a80df Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Sep 2017 13:58:02 +1000 Subject: powerpc/powernv: Make opal_event_shutdown() callable from IRQ context In opal_event_shutdown() we free all the IRQs hanging off the opal_event_irqchip. However it's not safe to do so if we're called from IRQ context, because free_irq() wants to synchronise versus IRQ context. This can lead to warnings and a stuck system. For example from sysrq-b: Trying to free IRQ 17 from IRQ context! ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/irq/manage.c:1461 __free_irq+0x398/0x8d0 ... 
NIP __free_irq+0x398/0x8d0 LR __free_irq+0x394/0x8d0 Call Trace: __free_irq+0x394/0x8d0 (unreliable) free_irq+0xa4/0x140 opal_event_shutdown+0x128/0x180 opal_shutdown+0x1c/0xb0 pnv_shutdown+0x20/0x40 machine_restart+0x38/0x90 emergency_restart+0x28/0x40 sysrq_handle_reboot+0x24/0x40 __handle_sysrq+0x198/0x590 hvc_poll+0x48c/0x8c0 hvc_handle_interrupt+0x1c/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_level_irq+0x250/0x770 generic_handle_irq+0x5c/0xa0 opal_handle_events+0x11c/0x240 opal_interrupt+0x38/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_fasteoi_irq+0x174/0xa10 generic_handle_irq+0x5c/0xa0 __do_irq+0xbc/0x4e0 call_do_irq+0x14/0x24 do_IRQ+0x18c/0x540 hardware_interrupt_common+0x158/0x180 We can avoid that by using disable_irq_nosync() rather than free_irq(). Although it doesn't fully free the IRQ, it should be sufficient when we're shutting down, particularly in an emergency. Add an in_interrupt() check and use free_irq() when we're shutting down normally. It's probably OK to use disable_irq_nosync() in that case too, but for now it's safer to leave that behaviour as-is. Fixes: 9f0fd0499d30 ("powerpc/powernv: Add a virtual irqchip for opal events") Reported-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-irqchip.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index ecdcba9d1220..9d1b8c0aaf93 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -174,8 +174,14 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) + if (!opal_irqs[i]) + continue; + + if (in_interrupt()) + disable_irq_nosync(opal_irqs[i]); + else free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; } } -- cgit v1.2.3 From cee5405da4020b0b0233bc8fb7c8da7322d2c52e Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:47 -0500 Subject: powerpc/hotplug: Improve responsiveness of hotplug change powerpc/hotplug: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. During hotplug CPU operations, this patch resets the timer on topology update work function to a small value to better ensure that the CPU topology is detected and configured sooner. 
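The resulting pattern around the online/offline paths looks like the sketch below (timed_topology_update() itself comes from a companion NUMA patch; its argument is assumed here to be the new poll delay in seconds):

	cpu_maps_update_done();
	/* ask for a topology re-check shortly, instead of waiting for the
	 * normal polling interval (the delay unit is an assumption) */
	timed_topology_update(1);
	rc = device_online(get_cpu_device(cpu));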
Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fadb95efbb9e..a7d14aa7bb7c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn) BUG_ON(get_cpu_current_state(cpu) != CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_online(get_cpu_device(cpu)); if (rc) goto out; @@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn) set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_offline(get_cpu_device(cpu)); if (rc) goto out; -- cgit v1.2.3 From 54820530c5faa9fd78e1c08cb6449100b1a19157 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 21:17:18 +1100 Subject: powerpc/powernv: Enable TM without suspend if possible Some Power9 revisions can run in a mode where TM operates without suspended state. If we find ourselves on a CPU that might be in this mode, we query OPAL to check, and if so we re-enable TM in CPU features, and enable a new user feature to signal to userspace that we are in this mode. We do not enable the "normal" user feature, PPC_FEATURE2_HTM, but we do enable PPC_FEATURE2_HTM_NOSC because that indicates to userspace that the kernel will abort transactions on syscall entry, which is true regardless of the suspend mode. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index cf52d53da460..d23f148a11f0 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "powernv.h" @@ -304,6 +305,28 @@ static int __init pnv_probe(void) return 1; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void __init pnv_tm_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL) || + !pvr_version_is(PVR_POWER9) || + early_cpu_has_feature(CPU_FTR_TM)) + return; + + if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS) + return; + + pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_TM; + /* Make sure "normal" HTM is off (it should be) */ + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM; + /* Turn on no suspend mode, and HTM no SC */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \ + PPC_FEATURE2_HTM_NOSC; + tm_suspend_disabled = true; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + /* * Returns the cpu frequency for 'cpu' in Hz. This is used by * /proc/cpuinfo -- cgit v1.2.3 From c28237f1d4ed2c9022f9eed656ecf36999e34f47 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 17 Oct 2017 13:31:42 +0200 Subject: powerpc-opal: Fix a typo in a comment line of two file headers Fix a word in these descriptions.
Signed-off-by: Markus Elfring Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-hmi.c | 2 +- arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d78fed728cdf..c9e1a4ff295c 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,5 +1,5 @@ /* - * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * OPAL hypervisor Maintenance interrupt handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 4495f428b500..d9916ea62305 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,5 +1,5 @@ /* - * OPAL asynchronus Memory error handling support in PowreNV. + * OPAL asynchronus Memory error handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit v1.2.3 From 4dd9eab39c71628d113168a01473ee17b5f61eac Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 18 Oct 2017 20:48:52 +0200 Subject: powerpc/pseries: Cleanup error handling in iommu_pseries_alloc_group() Although kfree(NULL) is legal, it's a bit lazy to rely on that to implement the error handling. So do it the normal Linux way using labels for each failure path. Signed-off-by: Markus Elfring [mpe: Squash a few patches and rewrite change log] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c181467d0ad..69921f72e2da 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -55,23 +55,23 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { - struct iommu_table_group *table_group = NULL; - struct iommu_table *tbl = NULL; - struct iommu_table_group_link *tgl = NULL; + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); if (!table_group) - goto fail_exit; + return NULL; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto fail_exit; + goto free_group; tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, node); if (!tgl) - goto fail_exit; + goto free_table; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); @@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) return table_group; -fail_exit: - kfree(tgl); - kfree(table_group); +free_table: kfree(tbl); - +free_group: + kfree(table_group); return NULL; } -- cgit v1.2.3 From 63c9d8a4b394f9d8e995292a7c74648760235b44 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 17:08:15 +1000 Subject: powerpc/powernv: Use FIXUP_ENDIAN_HV in OPAL return Close the recoverability gap for OPAL calls by using FIXUP_ENDIAN_HV in the return path. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 37cd170201a2..6f4b00a2ac46 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -94,7 +94,7 @@ opal_return: * bytes (always BE) since MSR:LE will end up fixed up as a side * effect of the rfid. */ - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); @@ -120,7 +120,7 @@ opal_real_call: hrfid opal_return_realmode: - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) -- cgit v1.2.3 From b8f89fea599d91e674497aad572613eb63181f31 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Thu, 28 Sep 2017 20:19:20 -0400 Subject: powerpc/pseries/vio: Dispose of virq mapping on vdevice unregister When a vdevice is DLPAR removed from the system the vio subsystem doesn't bother unmapping the virq from the irq_domain. As a result we have a virq mapped to a hardware irq that is no longer valid for the irq_domain. A side effect is that we are left with /proc/irq/<virq> affinity entries, and attempts to modify the smp_affinity of the irq will fail. In the following observed example the kernel log is spammed by ics_rtas_set_affinity errors after the removal of a VSCSI adapter. This is a result of irqbalance trying to adjust the affinity every 10 seconds. rpadlpar_io: slot U8408.E8E.10A7ACV-V5-C25 removed ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 This patch fixes the issue by calling irq_dispose_mapping() on the virq of the viodev on unregister. Fixes: f2ab6219969f ("powerpc/pseries: Add PFO support to the VIO bus") Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/vio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 12277bc9fd9e..d86938260a86 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev); void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); } EXPORT_SYMBOL(vio_unregister_device); -- cgit v1.2.3 From d6f934fd48803d9e58040e2cbab2feafe9bb9f01 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 27 Sep 2017 16:52:31 +1000 Subject: powerpc/powernv: Reserve a hole which appears after enabling IOV In order to make generic IOV code work, the physical function IOV BAR should start from the offset of the first VF. Since M64 segments share PE number space across the PHB, and some PEs may be in use at the time when IOV is enabled, the existing code shifts the IOV BAR to the index of the first PE/VF. This creates a hole in IOMEM space which can be potentially taken by some other device. This reserves a temporary hole on the parent and releases it when IOV is disabled; the temporary resources are stored in pci_dn to avoid kmalloc/free.
Signed-off-by: Alexey Kardashevskiy Acked-by: Bjorn Helgaas Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fb5cd7511189..7e87867984e7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. */ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", i, &res2, res, (offset > 0) ? "En" : "Dis", num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &pdn->holes[i]); + memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + } + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + pdn->holes[i].start = res2.start; + pdn->holes[i].end = res2.start + size * offset - 1; + pdn->holes[i].flags = IORESOURCE_BUS; + pdn->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &pdn->holes[i]); + } } return 0; } -- cgit v1.2.3 From 4e003747043d57aa75c9762fa148ef38afe68dd8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Oct 2017 15:08:43 +1100 Subject: powerpc/64s: Replace CONFIG_PPC_STD_MMU_64 with CONFIG_PPC_BOOK3S_64 CONFIG_PPC_STD_MMU_64 indicates support for the "standard" powerpc MMU on 64-bit CPUs. The "standard" MMU refers to the hash page table MMU found in "server" processors, from IBM mainly. Currently CONFIG_PPC_STD_MMU_64 is == CONFIG_PPC_BOOK3S_64. While it's annoying to have two symbols that always have the same value, it's not quite annoying enough to bother removing one. However with the arrival of Power9, we now have the situation where CONFIG_PPC_STD_MMU_64 is enabled, but the kernel is running using the Radix MMU - *not* the "standard" MMU. So it is now actively confusing to use it, because it implies that code is disabled or inactive when the Radix MMU is in use, however that is not necessarily true. So s/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/, and do some minor formatting updates of some of the affected lines. This will be a pain for backports, but c'est la vie. 
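For backports, the bulk of the conversion can be reproduced mechanically with something like the following (illustrative only; the patch also removes the Kconfig definition of PPC_STD_MMU_64 and reflows some of the affected lines by hand):

	git grep -l CONFIG_PPC_STD_MMU_64 -- arch/powerpc | \
		xargs sed -i 's/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/g'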
Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 6 +----- arch/powerpc/platforms/pseries/lpar.c | 8 ++++---- arch/powerpc/platforms/pseries/lparcfg.c | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 13663efc1d31..f8928ee85f6b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -294,10 +294,6 @@ config PPC_STD_MMU_32 def_bool y depends on PPC_STD_MMU && PPC32 -config PPC_STD_MMU_64 - def_bool y - depends on PPC_STD_MMU && PPC64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -323,7 +319,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_STD_MMU_64 + default y if PPC_BOOK3S_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..0ee4a469a4ae 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -93,7 +93,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -106,7 +106,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -129,7 +129,7 @@ void vpa_init(int cpu) } } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, @@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_SMLPAR */ -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 779fc2a1c8f7..b2706c483067 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); -- cgit v1.2.3 From 1fd6c02207107c8892219dacef01de7ced3d4ce7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Oct 2017 17:48:49 +0200 Subject: powerpc/mm: Add a CONFIG option to choose if radix is used by default Currently if the hardware supports the radix MMU we will use it, *unless* "disable_radix" is passed on the kernel command line. However some users would like the reverse semantics, i.e. the kernel uses the hash MMU by default, unless radix is explicitly requested on the command line. So add a CONFIG option to choose whether we use radix by default or not, and expand the disable_radix command line option to allow "disable_radix=no" which *enables* radix.
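For example, the resulting command line semantics are:

	disable_radix		# use hash, regardless of the Kconfig default
	disable_radix=yes	# same as above
	disable_radix=no	# use radix, even if PPC_RADIX_MMU_DEFAULT=n
	(nothing)		# follow CONFIG_PPC_RADIX_MMU_DEFAULT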
Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f8928ee85f6b..596bd9091478 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -304,6 +304,19 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config PPC_RADIX_MMU_DEFAULT + bool "Default to using the Radix MMU when possible" + depends on PPC_RADIX_MMU + default y + help + When the hardware supports the Radix MMU, default to using it unless + "disable_radix[=yes]" is specified on the kernel command line. + + If this option is disabled, the Hash MMU will be used by default, + unless "disable_radix=no" is specified on the kernel command line. + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3 From 71e24d7731a2903b1ae2bba2b2971c654d9c2aa6 Mon Sep 17 00:00:00 2001 From: "William A. Kennington III" Date: Fri, 22 Sep 2017 16:58:00 -0700 Subject: powerpc/opal: Fix EBUSY bug in acquiring tokens The current code checks the completion map to look for the first token that is complete. In some cases, a completion can come in but the token can still be on lease to the caller processing the completion. If this completed but unreleased token is the first token found in the bitmap by another task trying to acquire a token, then the __test_and_set_bit call will fail since the token will still be on lease. The acquisition will then fail with an EBUSY. This patch reorganizes the acquisition code to look at the opal_async_token_map for an unleased token. If the token has no lease it must have no outstanding completions so we should never see an EBUSY, unless we have leased out too many tokens. Since opal_async_get_token_interruptible is protected by a semaphore, we will practically never see EBUSY anymore. Fixes: 8d7248232208 ("powerpc/powernv: Infrastructure to support OPAL async completion") Signed-off-by: William A.
Kennington III Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index cf33769a7b72..45b3feb8aa2f 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -39,18 +39,18 @@ int __opal_async_get_token(void) int token; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); if (token >= opal_max_async_tokens) { token = -EBUSY; goto out; } - if (__test_and_set_bit(token, opal_async_token_map)) { + if (!__test_and_clear_bit(token, opal_async_complete_map)) { token = -EBUSY; goto out; } - __clear_bit(token, opal_async_complete_map); + __set_bit(token, opal_async_token_map); out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); -- cgit v1.2.3 From 59cf9a1cfcd9de6392d218fcd69413f2e77babbe Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:41 +1100 Subject: powerpc/opal: Make __opal_async_{get, release}_token() static There are no callers of either __opal_async_get_token() or __opal_async_release_token() outside this file, so they can be made static. This patch also removes the possibility of an emergency synchronous call to __opal_async_get_token(); as such it makes more sense to initialise opal_async_sem for the maximum number of async tokens. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 45b3feb8aa2f..64255d3ee14a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -33,7 +33,7 @@ static struct semaphore opal_async_sem; static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; -int __opal_async_get_token(void) +static int __opal_async_get_token(void) { unsigned long flags; int token; @@ -73,7 +73,7 @@ int opal_async_get_token_interruptible(void) } EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); -int __opal_async_release_token(int token) +static int __opal_async_release_token(int token) { unsigned long flags; @@ -199,11 +199,7 @@ int __init opal_async_comp_init(void) goto out_opal_node; } - /* Initialize to 1 less than the maximum tokens available, as we may - * require to pop one during emergency through synchronous call to - * __opal_async_get_token() - */ - sema_init(&opal_async_sem, opal_max_async_tokens - 1); + sema_init(&opal_async_sem, opal_max_async_tokens); out_opal_node: of_node_put(opal_node); -- cgit v1.2.3 From 86cd6d98020924f65a6773784c66c5b842e3e320 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:42 +1100 Subject: powerpc/opal: Rework the opal-async interface Future work will add an opal_async_wait_response_interruptible() which will call wait_event_interruptible(). This work requires extra token state to be tracked as wait_event_interruptible() can return and the caller could release the token before OPAL responds. Currently token state is tracked with two 64-bit bitfields, which may be larger than needed, since OPAL informs Linux how many async tokens there are.
It also uses an array indexed by token to store response messages for each token. The bitfields make it difficult to add more state and also provide a hard maximum as to how many tokens there can be - it is possible that OPAL will inform Linux that there are more than 64 tokens. Rather than add a bitfield to track the extra state, rework the internals slightly. Signed-off-by: Cyril Bur [mpe: Fix __opal_async_get_token() when no tokens are free] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 92 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 41 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 64255d3ee14a..a8a57310759a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,7 +1,7 @@ /* * PowerNV OPAL asynchronous completion interfaces * - * Copyright 2013 IBM Corp. + * Copyright 2013-2017 IBM Corp. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,40 +23,47 @@ #include #include -#define N_ASYNC_COMPLETIONS 64 +enum opal_async_token_state { + ASYNC_TOKEN_UNALLOCATED = 0, + ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_COMPLETED +}; + +struct opal_async_token { + enum opal_async_token_state state; + struct opal_msg response; +}; -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); static DEFINE_SPINLOCK(opal_async_comp_lock); static struct semaphore opal_async_sem; -static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; +static struct opal_async_token *opal_async_tokens; static int __opal_async_get_token(void) { unsigned long flags; - int token; + int i, token = -EBUSY; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); - if (token >= opal_max_async_tokens) { - token = -EBUSY; - goto out; - } - if (!__test_and_clear_bit(token, opal_async_complete_map)) { - token = -EBUSY; - goto out; + for (i = 0; i < opal_max_async_tokens; i++) { + if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) { + opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED; + token = i; + break; + } } - __set_bit(token, opal_async_token_map); - -out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); return token; } +/* + * Note: If the returned token is used in an opal call and opal returns + * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before + * calling another other opal_async_* function + */ int opal_async_get_token_interruptible(void) { int token; @@ -76,6 +83,7 @@ EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); static int __opal_async_release_token(int token) { unsigned long flags; + int rc; if (token < 0 || token >= opal_max_async_tokens) { pr_err("%s: Passed token is out of range, token %d\n", @@ -84,11 +92,18 @@ static int __opal_async_release_token(int token) } spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); - __clear_bit(token, opal_async_token_map); + switch (opal_async_tokens[token].state) { + case ASYNC_TOKEN_COMPLETED: + case ASYNC_TOKEN_ALLOCATED: + opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; + rc = 0; + break; + default: + rc = 1; + } spin_unlock_irqrestore(&opal_async_comp_lock, 
flags); - return 0; + return rc; } int opal_async_release_token(int token) @@ -96,12 +111,10 @@ int opal_async_release_token(int token) int ret; ret = __opal_async_release_token(token); - if (ret) - return ret; - - up(&opal_async_sem); + if (!ret) + up(&opal_async_sem); - return 0; + return ret; } EXPORT_SYMBOL_GPL(opal_async_release_token); @@ -122,13 +135,15 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) * functional. */ opal_wake_poller(); - wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); - memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + wait_event(opal_async_wait, opal_async_tokens[token].state + == ASYNC_TOKEN_COMPLETED); + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); return 0; } EXPORT_SYMBOL_GPL(opal_async_wait_response); +/* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { @@ -140,9 +155,9 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); + opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); wake_up(&opal_async_wait); @@ -178,24 +193,19 @@ int __init opal_async_comp_init(void) } opal_max_async_tokens = be32_to_cpup(async); - if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) - opal_max_async_tokens = N_ASYNC_COMPLETIONS; + opal_async_tokens = kcalloc(opal_max_async_tokens, + sizeof(*opal_async_tokens), GFP_KERNEL); + if (!opal_async_tokens) { + err = -ENOMEM; + goto out_opal_node; + } err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &opal_async_comp_nb); if (err) { pr_err("%s: Can't register OPAL event notifier (%d)\n", __func__, err); - goto out_opal_node; - } - - opal_async_responses = kzalloc( - sizeof(*opal_async_responses) * opal_max_async_tokens, - GFP_KERNEL); - if (!opal_async_responses) { - pr_err("%s: Out of memory, failed to do asynchronous " - "completion init\n", __func__); - err = -ENOMEM; + kfree(opal_async_tokens); goto out_opal_node; } -- cgit v1.2.3 From 95e1bc1daaeee4d598b235dc85b64b7a0bcc3060 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Fri, 3 Nov 2017 13:41:43 +1100 Subject: powernv/opal-sensor: remove unneeded lock Parallel sensor reads could run out of async tokens due to opal_get_sensor_data grabbing tokens but then doing the sensor read behind a mutex, essentially serializing the (possibly asynchronous and relatively slow) sensor read. It turns out that the mutex isn't needed at all: not only should the OPAL interface allow concurrent reads, the implementation is certainly safe for that; and if any sensor we read from isn't, the kernel is the wrong place to do the mutual exclusion, as OPAL should be doing it for the kernel. So, remove the mutex. Additionally, we shouldn't be printing out an error when we don't get a token, as the only way this should happen is if we've been interrupted in down_interruptible() on the semaphore.
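With the mutex gone, concurrent readers are throttled only by token availability; each caller follows roughly this pattern (a sketch of the opal_get_sensor_data() flow shown in the diff below):

	token = opal_async_get_token_interruptible();	/* per-caller token */
	if (token < 0)
		return token;
	ret = opal_sensor_read(sensor_hndl, token, &data);
	if (ret == OPAL_ASYNC_COMPLETION)
		ret = opal_async_wait_response(token, &msg);	/* no global lock held */
	opal_async_release_token(token);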
Reported-by: Robert Lippert Signed-off-by: Stewart Smith Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-sensor.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index aa267f120033..0a7074bb91dc 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -19,13 +19,10 @@ */ #include -#include #include #include #include -static DEFINE_MUTEX(opal_sensor_mutex); - /* * This will return sensor information to driver based on the requested sensor * handle. A handle is an opaque id for the powernv, read by the driver from the @@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) __be32 data; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_err("%s: Couldn't get the token, returning\n", __func__); - ret = token; - goto out; - } + if (token < 0) + return token; - mutex_lock(&opal_sensor_mutex); ret = opal_sensor_read(sensor_hndl, token, &data); switch (ret) { case OPAL_ASYNC_COMPLETION: @@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) if (ret) { pr_err("%s: Failed to wait for the async response, %d\n", __func__, ret); - goto out_token; + goto out; } ret = opal_error_code(opal_get_async_rc(msg)); @@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) break; } -out_token: - mutex_unlock(&opal_sensor_mutex); - opal_async_release_token(token); out: + opal_async_release_token(token); return ret; } EXPORT_SYMBOL_GPL(opal_get_sensor_data); -- cgit v1.2.3 From 9aab24495c5644b25ced0d11816cc3c061bf74fc Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:44 +1100 Subject: powerpc/opal: Add opal_async_wait_response_interruptible() to opal-async This patch adds an _interruptible version of opal_async_wait_response(). This is useful when a long running OPAL call is performed on behalf of a userspace thread, for example, the opal_flash_{read,write,erase} functions performed by the powernv-flash MTD driver. It is foreseeable that these functions would take upwards of two minutes causing the wait_event() to block long enough to cause hung task warnings. Furthermore, wait_event_interruptible() is preferable as otherwise there is no way for signals to stop the process which is going to be confusing in userspace. 
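A caller doing a long-running operation on behalf of userspace would then follow a pattern along these lines (a sketch only, not the MTD driver's actual code; the opal_flash_read() calling convention is an assumption based on the driver named above):

	token = opal_async_get_token_interruptible();
	if (token < 0)
		return token;
	rc = opal_flash_read(id, offset, __pa(buf), size, token);
	if (rc == OPAL_ASYNC_COMPLETION) {
		rc = opal_async_wait_response_interruptible(token, &msg);
		if (rc == 0)	/* got a response */
			rc = opal_error_code(opal_get_async_rc(msg));
		/* on -ERESTARTSYS the token is freed once OPAL responds */
	}
	opal_async_release_token(token);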
Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 84 +++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index a8a57310759a..18a355fa15e8 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -26,6 +26,8 @@ enum opal_async_token_state { ASYNC_TOKEN_UNALLOCATED = 0, ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_DISPATCHED, + ASYNC_TOKEN_ABANDONED, ASYNC_TOKEN_COMPLETED }; @@ -61,8 +63,9 @@ static int __opal_async_get_token(void) /* * Note: If the returned token is used in an opal call and opal returns - * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before - * calling another other opal_async_* function + * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or + * opal_async_wait_response_interruptible() at least once before calling another + * opal_async_* function */ int opal_async_get_token_interruptible(void) { @@ -98,6 +101,14 @@ static int __opal_async_release_token(int token) opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; rc = 0; break; + /* + * DISPATCHED and ABANDONED tokens must wait for OPAL to respond. + * Mark a DISPATCHED token as ABANDONED so that the response handling + * code knows no one cares and that it can free it then. + */ + case ASYNC_TOKEN_DISPATCHED: + opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; + /* Fall through */ default: rc = 1; } @@ -130,7 +141,11 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } - /* Wakeup the poller before we wait for events to speed things + /* + * There is no need to mark the token as dispatched, wait_event() + * will block until the token completes. + * + * Wakeup the poller before we wait for events to speed things * up on platforms or simulators where the interrupts aren't * functional. */ @@ -143,11 +158,66 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) } EXPORT_SYMBOL_GPL(opal_async_wait_response); +int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg) +{ + unsigned long flags; + int ret; + + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + /* + * The first time this gets called we mark the token as DISPATCHED + * so that if wait_event_interruptible() returns not zero and the + * caller frees the token, we know not to actually free the token + * until the response comes. + * + * Only change if the token is ALLOCATED - it may have been + * completed even before the caller gets around to calling this + * the first time. + * + * There is also a dirty great comment at the token allocation + * function that if the opal call returns OPAL_ASYNC_COMPLETION to + * the caller then the caller *must* call this or the not + * interruptible version before doing anything else with the + * token. 
+ */ + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) { + spin_lock_irqsave(&opal_async_comp_lock, flags); + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) + opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED; + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + } + + /* + * Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. + */ + opal_wake_poller(); + ret = wait_event_interruptible(opal_async_wait, + opal_async_tokens[token].state == + ASYNC_TOKEN_COMPLETED); + if (!ret) + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); + + return ret; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible); + /* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { struct opal_msg *comp_msg = msg; + enum opal_async_token_state state; unsigned long flags; uint64_t token; @@ -155,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); + state = opal_async_tokens[token].state; opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); + if (state == ASYNC_TOKEN_ABANDONED) { + /* Free the token, no one else will */ + opal_async_release_token(token); + return 0; + } + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); wake_up(&opal_async_wait); return 0; -- cgit v1.2.3 From 77adbd2207e858f5923aa94e4a7d2f29f09217ed Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:45 +1100 Subject: powerpc/powernv: Add OPAL_BUSY to opal_error_code() Also export opal_error_code() so that it can be used in modules. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 65c79ecf5a4d..041ddbd1fc57 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -998,6 +998,7 @@ int opal_error_code(int rc) case OPAL_PARAMETER: return -EINVAL; case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY: case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; case OPAL_PERMISSION: return -EPERM; @@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); +EXPORT_SYMBOL_GPL(opal_error_code); -- cgit v1.2.3 From cd77b5ce208c153260ed7882d8910f2395bfaabd Mon Sep 17 00:00:00 2001 From: Shriya Date: Fri, 13 Oct 2017 10:06:41 +0530 Subject: powerpc/powernv/cpufreq: Fix the frequency read by /proc/cpuinfo Reading /proc/cpuinfo in turn calls cpufreq_quick_get(), which returns the last frequency requested by the kernel, but may not reflect the actual frequency the processor is running at. This patch calls cpufreq_get() instead, which returns the current frequency reported by the hardware.
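The difference, in brief (a sketch of generic cpufreq behaviour, not powernv-specific code):

	cpufreq_quick_get(cpu);	/* cached policy value: the last frequency
				 * the kernel requested */
	cpufreq_get(cpu);	/* invokes the cpufreq driver's ->get() hook
				 * to read the frequency from the hardware */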
Fixes: fb5153d05a7d ("powerpc: powernv: Implement ppc_md.get_proc_freq()") Signed-off-by: Shriya Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d23f148a11f0..62f4a5ad8594 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -335,7 +335,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) { unsigned long ret_freq; - ret_freq = cpufreq_quick_get(cpu) * 1000ul; + ret_freq = cpufreq_get(cpu) * 1000ul; /* * If the backend cpufreq driver does not exist, -- cgit v1.2.3 From 9003a249815a15704f415954039d1c7ea27da9ad Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 7 Nov 2017 14:43:01 +1100 Subject: powerpc/powernv/ioda: Remove explicit max window size check DMA windows can only have a power-of-two size on IODA2 hardware, and using memory_hotplug_max() to determine the upper limit won't work correctly if it returns a non-power-of-two value. This removes the check, as the platform code does this check in pnv_pci_ioda2_setup_default_config() anyway; the other client is VFIO, and that checks against the locked_vm limit, which prevents userspace from locking too much memory. It is expected to mostly impact DPDK on machines with a non-power-of-two RAM size. KVM guests are less likely to be affected, as guests usually get less than half of the host's RAM. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7e87867984e7..749055553064 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2797,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if (!is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ -- cgit v1.2.3 From e34917fbee1226144c94413697ddbf2d5b06d0d3 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:41 -0800 Subject: powerpc/vas: init missing fields from [rt]xattr Initialize a few missing window context fields from the window attributes specified by the caller. These fields are currently set to their default values by the caller (NX-842), but it would be good to apply them anyway.
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 5aae845b8cd9..cec7ab7119df 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->nx_win = rxattr->nx_win; winctx->fault_win = rxattr->fault_win; + winctx->user_win = rxattr->user_win; + winctx->rej_no_credit = rxattr->rej_no_credit; winctx->rx_word_mode = rxattr->rx_win_ord_mode; winctx->tx_word_mode = rxattr->tx_win_ord_mode; winctx->rx_wcred_mode = rxattr->rx_wcred_mode; winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + winctx->notify_early = rxattr->notify_early; if (winctx->nx_win) { winctx->data_stamp = true; @@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; winctx->pin_win = txattr->pin_win; + winctx->rej_no_credit = txattr->rej_no_credit; + winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable; winctx->rx_wcred_mode = txattr->rx_wcred_mode; winctx->tx_wcred_mode = txattr->tx_wcred_mode; winctx->rx_word_mode = txattr->rx_win_ord_mode; winctx->tx_word_mode = txattr->tx_win_ord_mode; + winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; if (winctx->nx_win) { winctx->data_stamp = true; -- cgit v1.2.3 From 51b537124fc24074aee67cae9ca94ec4d9c204fc Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:42 -0800 Subject: powerpc/vas: Validate window credits NX-842, the only user of VAS, sets the window credits to default values but VAS should check the credits against the possible max values. The VAS_WCREDS_MIN is not needed and can be dropped. 
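Evaluating the limits this patch introduces (shown in the diff below) makes the split concrete; the field-width reading is an inference, not stated by the patch:

/* VAS_RX_WCREDS_MAX = (64 << 10) - 1 = 65535 (fits a 16-bit field) */
/* VAS_TX_WCREDS_MAX = ( 4 << 10) - 1 =  4095 (fits a 12-bit field) */
/* VAS_WCREDS_DEFAULT = 1 << 10       =  1024                       */

if (attr->wcreds_max > VAS_RX_WCREDS_MAX)       /* receive windows */
        return false;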
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ arch/powerpc/platforms/powernv/vas.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index cec7ab7119df..a2fe120ac06d 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; + if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + return false; + if (attr->nx_win) { /* cannot be fault or user window if it is nx */ if (attr->fault_win || attr->user_win) @@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (cop > VAS_COP_TYPE_MAX) return false; + if (attr->wcreds_max > VAS_TX_WCREDS_MAX) + return false; + if (attr->user_win && (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) return false; diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 38dee5d50f31..fea0de44f076 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -106,8 +106,8 @@ * * TODO: Needs tuning for per-process credits */ -#define VAS_WCREDS_MIN 16 -#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) +#define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) /* -- cgit v1.2.3 From 0a2c2c24cf78473da785654361ec957f129f4820 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:43 -0800 Subject: powerpc/vas: Cleanup some debug code Clean up vas.h and the debug code around ifdef vas_debug. 
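The pr_debug() calls that replace the old "#ifdef vas_debug" printks (see the diff below) are compiled in unconditionally but gated at runtime by the dynamic debug facility. A before/after sketch of the pattern (illustrative, not the exact lines of the diff):

/* before: compiled out unless the file was built with -Dvas_debug */
#ifdef vas_debug
        pr_err("win #%d: busy\n", winid);
#endif

/*
 * after: always built, enabled at runtime with CONFIG_DYNAMIC_DEBUG,
 * e.g. echo 'file vas-window.c +p' > /sys/kernel/debug/dynamic_debug/control
 */
        pr_debug("win #%d: busy\n", winid);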
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 8 +++-- arch/powerpc/platforms/powernv/vas.h | 54 ++++++----------------------- 2 files changed, 17 insertions(+), 45 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a2fe120ac06d..67ffc5d994cc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, static bool rx_win_args_valid(enum vas_cop_type cop, struct vas_rx_win_attr *attr) { - dump_rx_win_attr(attr); + pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early, + attr->rx_fifo_size); if (cop >= VAS_COP_TYPE_MAX) return false; @@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) else rc = -EINVAL; - print_fifo_msg_count(txwin); + pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid, + read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); return rc; } diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index fea0de44f076..63e8e037eda0 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -258,6 +258,16 @@ #define VAS_NX_UTIL_ADDER_OFFSET 0x180 #define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + /* * Local Notify Scope Control Register. (Receive windows only). */ @@ -385,43 +395,15 @@ struct vas_winctx { extern struct vas_instance *find_vas_instance(int vasid); -/* - * VREG(x): - * Expand a register's short name (eg: LPID) into two parameters: - * - the register's short name in string form ("LPID"), and - * - the name of the macro (eg: VAS_LPID_OFFSET), defining the - * register's offset in the window context - */ -#define VREG_SFX(n, s) __stringify(n), VAS_##n##s -#define VREG(r) VREG_SFX(r, _OFFSET) - -#ifdef vas_debug -static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) -{ - pr_err("fault %d, notify %d, intr %d early %d\n", - attr->fault_win, attr->notify_disable, - attr->intr_disable, attr->notify_early); - - pr_err("rx_fifo_size %d, max value %d\n", - attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); -} - static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) { if (val) - pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n", win->tx_win ? 
"Tx" : "Rx", win->winid, name, regptr, val); } -#else /* vas_debug */ - -#define vas_log_write(win, name, reg, val) -#define dump_rx_win_attr(attr) - -#endif /* vas_debug */ - static inline void write_uwc_reg(struct vas_window *win, char *name, s32 reg, u64 val) { @@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } -#ifdef vas_debug - -static void print_fifo_msg_count(struct vas_window *txwin) -{ - uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); - pr_devel("Winid %d, Msg count %llu\n", txwin->winid, - (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); -} -#else /* vas_debug */ - -#define print_fifo_msg_count(window) - -#endif /* vas_debug */ - #endif /* _VAS_H */ -- cgit v1.2.3 From 4963ac3632dda7433db5149d6abdfc644a8d8ab2 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:44 -0800 Subject: powerpc/vas: Drop poll_window_cast_out(). Polling for window cast out is listed in the spec, but turns out that it is not strictly necessary and slows down window close. Making it a stub for now. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 67ffc5d994cc..8ab8a8208347 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1079,25 +1079,25 @@ retry: } } +/* + * Have the hardware cast a window out of cache and wait for it to + * be completed. + * + * NOTE: It can take a relatively long time to cast the window context + * out of the cache. It is not strictly necessary to cast out if: + * + * - we clear the "Pin Window" bit (so hardware is free to evict) + * + * - we re-initialize the window context when it is reassigned. + * + * We do the former in vas_win_close() and latter in vas_win_open(). + * So, ignoring the cast-out for now. We can add it as needed. If + * casting out becomes necessary we should consider offloading the + * job to a worker thread, so the window close can proceed quickly. + */ static void poll_window_castout(struct vas_window *window) { - int cached; - u64 val; - - /* Cast window context out of the cache */ -retry: - val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); - cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); - if (cached) { - val = 0ULL; - val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); - val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); - write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - goto retry; - } + /* stub for now */ } /* -- cgit v1.2.3 From 36a288fe9dab9a6b0b50ffdb5c34f04c42cee2ac Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:45 -0800 Subject: powerpc/vas: Use helper to unpin/close window Use a helper to have the hardware unpin and mark a window closed. 
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 8ab8a8208347..95622a984b05 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1100,6 +1100,20 @@ static void poll_window_castout(struct vas_window *window) /* stub for now */ } +/* + * Unpin and close a window so no new requests are accepted and the + * hardware can evict this window from cache if necessary. + */ +static void unpin_close_window(struct vas_window *window) +{ + u64 val; + + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); +} + /* * Close a window. * @@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window) */ int vas_win_close(struct vas_window *window) { - u64 val; - if (!window) return 0; @@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window) poll_window_busy_state(window); - /* Unpin window from cache and close it */ - val = read_hvwc_reg(window, VREG(WINCTL)); - val = SET_FIELD(VAS_WINCTL_PIN, val, 0); - val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); - write_hvwc_reg(window, VREG(WINCTL), val); + unpin_close_window(window); poll_window_castout(window); -- cgit v1.2.3 From dfe954e4456277effffb2c5add47fa25390f8cea Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:46 -0800 Subject: powerpc/vas: Reduce polling interval for busy state A VAS window is normally in "busy" state for only a short duration. Reduce the time we wait for the window to go to "not-busy" state to speed-up vas_win_close() a bit. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 95622a984b05..1422cdd7d917 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * Wait for the window to go to "not-busy" state. It should only take a + * short time to queue a CRB, so window should not be busy for too long. + * Trying 5ms intervals. + */ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; retry: - /* - * Poll Window Busy flag - */ val = read_hvwc_reg(window, VREG(WIN_STATUS)); busy = GET_FIELD(VAS_WIN_BUSY, val); if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(msecs_to_jiffies(5)); goto retry; } } -- cgit v1.2.3 From 62f659e08ccd657ead6901011f5e542dbdc477c5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:47 -0800 Subject: powerpc/vas: Save configured window credits Save the configured max window credits for a window in the vas_window structure. We will need this when polling for return of window credits. 
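Both the existing busy-state poll and the credits poll added in the next patch follow the same sleep-then-recheck idiom; a distilled sketch (example_condition_met is a hypothetical stand-in for reading a window status register):

static void example_poll(struct vas_window *window)
{
        while (!example_condition_met(window)) {
                /* sleep ~10ms between reads instead of busy-waiting */
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(msecs_to_jiffies(10));
        }
}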
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++-- arch/powerpc/platforms/powernv/vas.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 1422cdd7d917..a59a187c0cd1 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->rx_fifo = rxattr->rx_fifo; winctx->rx_fifo_size = rxattr->rx_fifo_size; - winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = rxwin->wcreds_max; winctx->pin_win = rxattr->pin_win; winctx->nx_win = rxattr->nx_win; @@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; + rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; if (rxattr->user_win) rxwin->pid = task_pid_vnr(current); @@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin, */ memset(winctx, 0, sizeof(struct vas_winctx)); - winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = txwin->wcreds_max; winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; @@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->nx_win = txwin->rxwin->nx_win; txwin->pid = attr->pid; txwin->user_win = attr->user_win; + txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; init_winctx_for_txwin(txwin, attr, &winctx); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 63e8e037eda0..02d8a31d9051 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -332,6 +332,7 @@ struct vas_window { void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ pid_t pid; /* Linux process id of owner */ + int wcreds_max; /* Window credits */ /* Fields applicable only to send windows */ void *paste_kaddr; -- cgit v1.2.3 From 6fccac16c578c699bf0714a6c930b0ceb81305a0 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:48 -0800 Subject: powerpc/vas: poll for return of window credits Normally, the NX driver waits for the CRBs to be processed before closing the window. But it is better to ensure that the credits are returned before the window gets reassigned later. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a59a187c0cd1..23c13a7dcf89 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1062,6 +1062,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * If credit checking is enabled for this window, poll for the return + * of window credits (i.e for NX engines to process any outstanding CRBs). + * Since NX-842 waits for the CRBs to be processed before closing the + * window, we should not have to wait for too long. + * + * TODO: We retry in 10ms intervals now. 
We could/should probably peek at + * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending + * CRBs on the FIFO and compute the delay dynamically on each retry. + * But that is not really needed until we support NX-GZIP access from + * user space. (NX-842 driver waits for CSB and Fast thread-wakeup + * doesn't use credit checking). + */ +static void poll_window_credits(struct vas_window *window) +{ + u64 val; + int creds, mode; + + val = read_hvwc_reg(window, VREG(WINCTL)); + if (window->tx_win) + mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val); + else + mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val); + + if (!mode) + return; +retry: + if (window->tx_win) { + val = read_hvwc_reg(window, VREG(TX_WCRED)); + creds = GET_FIELD(VAS_TX_WCRED, val); + } else { + val = read_hvwc_reg(window, VREG(LRX_WCRED)); + creds = GET_FIELD(VAS_LRX_WCRED, val); + } + + if (creds < window->wcreds_max) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(10)); + goto retry; + } +} + /* * Wait for the window to go to "not-busy" state. It should only take a * short time to queue a CRB, so window should not be busy for too long. @@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window) unpin_close_window(window); + poll_window_credits(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ -- cgit v1.2.3 From ca03258b6b338b392c778bed9c7dd56e6a513012 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:49 -0800 Subject: powerpc/vas: Create cpu to vas id mapping Create a cpu to vasid mapping so callers can specify -1 instead of trying to find a VAS id. Changelog[v2] [Michael Ellerman] Use per-cpu variables to simplify code. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index 565a4878fefa..abb7090a22b4 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -18,15 +18,18 @@ #include #include #include +#include #include "vas.h" static DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); +static DEFINE_PER_CPU(int, cpu_vas_id); + static int init_vas_instance(struct platform_device *pdev) { - int rc, vasid; + int rc, cpu, vasid; struct resource *res; struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; @@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev) "paste_win_id_shift 0x%llx\n", pdev->name, vasid, vinst->paste_base_addr, vinst->paste_win_id_shift); + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) + per_cpu(cpu_vas_id, cpu) = vasid; + } + mutex_lock(&vas_mutex); list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); @@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid) struct vas_instance *vinst; mutex_lock(&vas_mutex); + + if (vasid == -1) + vasid = per_cpu(cpu_vas_id, smp_processor_id()); + list_for_each(ent, &vas_instances) { vinst = list_entry(ent, struct vas_instance, node); if (vinst->vas_id == vasid) { -- cgit v1.2.3 From d4ef61b5e8955fb913e2e1a6c1533414859a839d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:50 -0800 Subject: powerpc/vas, nx-842: Define and use chip_to_vas_id() Define a helper, chip_to_vas_id() to map a given chip id to 
corresponding vas id. Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS window to be on the same chip where the calling thread is executing. These callers can pass in -1 for the VAS id. This interface will be useful if a thread running on one chip wants to open a window on another chip (like the NX-842 driver does during start up). Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index abb7090a22b4..cd9a733d05e2 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid) return NULL; } +int chip_to_vas_id(int chipid) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == chipid) + return per_cpu(cpu_vas_id, cpu); + } + return -1; +} + static int vas_probe(struct platform_device *pdev) { return init_vas_instance(pdev); -- cgit v1.2.3 From ece4e51291485bb4a71ff554964948b02ab89823 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:51 -0800 Subject: powerpc/vas: Export HVWC to debugfs Export the VAS Window context information to debugfs. We need to hold a mutex when closing the window to prevent a race with the debugfs read(). Rather than introduce a per-instance mutex, we use the global vas_mutex for now, since it is not heavily contended. The window->cop field is only relevant to a receive window so we were not setting it for a send window (which is is paired to a receive window anyway). But to simplify reporting in debugfs, set the 'cop' field for the send window also. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/Makefile | 3 +- arch/powerpc/platforms/powernv/vas-debug.c | 209 ++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/vas-window.c | 34 ++++- arch/powerpc/platforms/powernv/vas.c | 6 +- arch/powerpc/platforms/powernv/vas.h | 14 ++ 5 files changed, 257 insertions(+), 9 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 37d60f7dd86d..17921c45d10b 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -14,4 +14,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_FTW) += nx-ftw.o diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c new file mode 100644 index 000000000000..ca22f1eae050 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include "vas.h" + +static struct dentry *vas_debugfs; + +static char *cop_to_str(int cop) +{ + switch (cop) { + case VAS_COP_TYPE_FAULT: return "Fault"; + case VAS_COP_TYPE_842: return "NX-842 Normal Priority"; + case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority"; + case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority"; + case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority"; + case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup"; + default: return "Unknown"; + } +} + +static int info_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), + window->tx_win ? "Send" : "Receive"); + seq_printf(s, "Pid : %d\n", window->pid); + +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int info_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_dbg_show, inode->i_private); +} + +static const struct file_operations info_fops = { + .open = info_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void print_reg(struct seq_file *s, struct vas_window *win, + char *name, u32 reg) +{ + seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name); +} + +static int hvwc_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + print_reg(s, window, VREG(LPID)); + print_reg(s, window, VREG(PID)); + print_reg(s, window, VREG(XLATE_MSR)); + print_reg(s, window, VREG(XLATE_LPCR)); + print_reg(s, window, VREG(XLATE_CTL)); + print_reg(s, window, VREG(AMR)); + print_reg(s, window, VREG(SEIDR)); + print_reg(s, window, VREG(FAULT_TX_WIN)); + print_reg(s, window, VREG(OSU_INTR_SRC_RA)); + print_reg(s, window, VREG(HV_INTR_SRC_RA)); + print_reg(s, window, VREG(PSWID)); + print_reg(s, window, VREG(LFIFO_BAR)); + print_reg(s, window, VREG(LDATA_STAMP_CTL)); + print_reg(s, window, VREG(LDMA_CACHE_CTL)); + print_reg(s, window, VREG(LRFIFO_PUSH)); + print_reg(s, window, VREG(CURR_MSG_COUNT)); + print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT)); + print_reg(s, window, VREG(LRX_WCRED)); + print_reg(s, window, VREG(LRX_WCRED_ADDER)); + print_reg(s, window, VREG(TX_WCRED)); + print_reg(s, window, VREG(TX_WCRED_ADDER)); + print_reg(s, window, VREG(LFIFO_SIZE)); + print_reg(s, window, VREG(WINCTL)); + print_reg(s, window, VREG(WIN_STATUS)); + print_reg(s, window, VREG(WIN_CTX_CACHING_CTL)); + print_reg(s, window, VREG(TX_RSVD_BUF_COUNT)); + print_reg(s, window, VREG(LRFIFO_WIN_PTR)); + print_reg(s, window, VREG(LNOTIFY_CTL)); + print_reg(s, window, VREG(LNOTIFY_PID)); + print_reg(s, window, VREG(LNOTIFY_LPID)); + print_reg(s, window, VREG(LNOTIFY_TID)); + print_reg(s, window, VREG(LNOTIFY_SCOPE)); + print_reg(s, window, VREG(NX_UTIL_ADDER)); +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int hvwc_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, hvwc_dbg_show, inode->i_private); +} + +static const struct file_operations hvwc_fops = { + .open = hvwc_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void vas_window_free_dbgdir(struct vas_window *window) +{ + if (window->dbgdir) { + 
debugfs_remove_recursive(window->dbgdir); + kfree(window->dbgname); + window->dbgdir = NULL; + window->dbgname = NULL; + } +} + +void vas_window_init_dbgdir(struct vas_window *window) +{ + struct dentry *f, *d; + + if (!window->vinst->dbgdir) + return; + + window->dbgname = kzalloc(16, GFP_KERNEL); + if (!window->dbgname) + return; + + snprintf(window->dbgname, 16, "w%d", window->winid); + + d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir); + if (IS_ERR(d)) + goto free_name; + + window->dbgdir = d; + + f = debugfs_create_file("info", 0444, d, window, &info_fops); + if (IS_ERR(f)) + goto remove_dir; + + f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); + if (IS_ERR(f)) + goto remove_dir; + + return; + +free_name: + kfree(window->dbgname); + window->dbgname = NULL; + +remove_dir: + debugfs_remove_recursive(window->dbgdir); + window->dbgdir = NULL; +} + +void vas_instance_init_dbgdir(struct vas_instance *vinst) +{ + struct dentry *d; + + if (!vas_debugfs) + return; + + vinst->dbgname = kzalloc(16, GFP_KERNEL); + if (!vinst->dbgname) + return; + + snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id); + + d = debugfs_create_dir(vinst->dbgname, vas_debugfs); + if (IS_ERR(d)) + goto free_name; + + vinst->dbgdir = d; + return; + +free_name: + kfree(vinst->dbgname); + vinst->dbgname = NULL; + vinst->dbgdir = NULL; +} + +void vas_init_dbgdir(void) +{ + vas_debugfs = debugfs_create_dir("vas", NULL); + if (IS_ERR(vas_debugfs)) + vas_debugfs = NULL; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 23c13a7dcf89..c030d4cf982e 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -145,23 +145,37 @@ static void unmap_paste_region(struct vas_window *window) } /* - * Unmap the MMIO regions for a window. + * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't + * unmap when the window's debugfs dir is in use. This serializes close + * of a window even on another VAS instance but since its not a critical + * path, just minimize the time we hold the mutex for now. We can add + * a per-instance mutex later if necessary. 
*/ static void unmap_winctx_mmio_bars(struct vas_window *window) { int len; + void *uwc_map; + void *hvwc_map; u64 busaddr_start; - if (window->hvwc_map) { + mutex_lock(&vas_mutex); + + hvwc_map = window->hvwc_map; + window->hvwc_map = NULL; + + uwc_map = window->uwc_map; + window->uwc_map = NULL; + + mutex_unlock(&vas_mutex); + + if (hvwc_map) { get_hvwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->hvwc_map, busaddr_start, len); - window->hvwc_map = NULL; + unmap_region(hvwc_map, busaddr_start, len); } - if (window->uwc_map) { + if (uwc_map) { get_uwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->uwc_map, busaddr_start, len); - window->uwc_map = NULL; + unmap_region(uwc_map, busaddr_start, len); } } @@ -528,6 +542,9 @@ static void vas_window_free(struct vas_window *window) struct vas_instance *vinst = window->vinst; unmap_winctx_mmio_bars(window); + + vas_window_free_dbgdir(window); + kfree(window); vas_release_window_id(&vinst->ida, winid); @@ -552,6 +569,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst) if (map_winctx_mmio_bars(window)) goto out_free; + vas_window_init_dbgdir(window); + return window; out_free: @@ -974,6 +993,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto put_rxwin; } + txwin->cop = cop; txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index cd9a733d05e2..c488621dbec3 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -22,7 +22,7 @@ #include "vas.h" -static DEFINE_MUTEX(vas_mutex); +DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); @@ -86,6 +86,8 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + vas_instance_init_dbgdir(vinst); + dev_set_drvdata(&pdev->dev, vinst); return 0; @@ -157,6 +159,8 @@ static int __init vas_init(void) int found = 0; struct device_node *dn; + vas_init_dbgdir(); + platform_driver_register(&vas_driver); for_each_compatible_node(dn, NULL, "ibm,vas") { diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 02d8a31d9051..756cbc5335bc 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include /* * Overview of Virtual Accelerator Switchboard (VAS). 
@@ -317,6 +319,9 @@ struct vas_instance { struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; + + char *dbgname; + struct dentry *dbgdir; }; /* @@ -334,6 +339,9 @@ struct vas_window { pid_t pid; /* Linux process id of owner */ int wcreds_max; /* Window credits */ + char *dbgname; + struct dentry *dbgdir; + /* Fields applicable only to send windows */ void *paste_kaddr; char *paste_addr_name; @@ -394,7 +402,13 @@ struct vas_winctx { enum vas_notify_after_count notify_after_count; }; +extern struct mutex vas_mutex; + extern struct vas_instance *find_vas_instance(int vasid); +extern void vas_init_dbgdir(void); +extern void vas_instance_init_dbgdir(struct vas_instance *vinst); +extern void vas_window_init_dbgdir(struct vas_window *win); +extern void vas_window_free_dbgdir(struct vas_window *win); static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) -- cgit v1.2.3 From 5676be2fb7035ac32da3a96241611e7eddff6157 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:56 -0800 Subject: powerpc/vas: Define vas_win_paste_addr() Define an interface that the NX drivers can use to find the physical paste address of a send window. This interface is expected to be used with the mmap() operation of the NX driver's device. i.e the user space process can use driver's mmap() operation to map the send window's paste address into their address space and then use copy and paste instructions to submit the CRBs to the NX engine. Note that kernel drivers will use vas_paste_crb() directly and don't need this interface. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index c030d4cf982e..d7d06533a1e9 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } +u64 vas_win_paste_addr(struct vas_window *win) +{ + u64 addr; + + compute_paste_address(win, &addr, NULL); + + return addr; +} +EXPORT_SYMBOL(vas_win_paste_addr); + static inline void get_hvwc_mmio_bar(struct vas_window *window, u64 *start, int *len) { -- cgit v1.2.3 From 61f3cca8cda979646c24accd9dbf3e2de7ea6ceb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:57 -0800 Subject: powerpc/vas: Define vas_win_id() Define an interface to return a system-wide unique id for a given VAS window. The vas_win_id() will be used in a follow-on patch to generate an unique handle for a user space receive window. Applications can use this handle to pair send and receive windows for fast thread-wakeup. The hardware refers to this system-wide unique id as a Partition Send Window ID which is expected to be used during fault handling. Hence the "pswid" in the function names. 
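Using the encode/decode helpers the diff below introduces, a worked example (values chosen for illustration): VAS id 3 and window id 42 pack into 0x0300002a, with the VAS id in the top byte (bits 0:7 in the big-endian numbering of the comment) and the window id in the low 16 bits:

u32 pswid = encode_pswid(3, 42);        /* 3 << 24 | 42 == 0x0300002a */
int vasid, winid;

decode_pswid(pswid, &vasid, &winid);    /* vasid == 3, winid == 42 */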
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 9 +++++++++ arch/powerpc/platforms/powernv/vas.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index d7d06533a1e9..82754924c2bc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1235,3 +1235,12 @@ int vas_win_close(struct vas_window *window) return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return a system-wide unique window id for the window @win. + */ +u32 vas_win_id(struct vas_window *win) +{ + return encode_pswid(win->vinst->vas_id, win->winid); +} +EXPORT_SYMBOL_GPL(vas_win_id); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 756cbc5335bc..ae0100fd35bb 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + u32 pswid = 0; + + pswid |= vasid << (31 - 7); + pswid |= winid; + + return pswid; +} + +static inline void decode_pswid(u32 pswid, int *vasid, int *winid) +{ + if (vasid) + *vasid = pswid >> (31 - 7) & 0xFF; + + if (winid) + *winid = pswid & 0xFFFF; +} #endif /* _VAS_H */ -- cgit v1.2.3 From 6c8e6bb2a52d5e7ae5bbde21c21f6d5dfd6e9ae8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:58 -0800 Subject: powerpc/vas: Add support for user receive window Add support for user space receive window (for the Fast thread-wakeup coprocessor type) Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 56 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 82754924c2bc..2b3eb01ab110 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -16,7 +16,8 @@ #include #include #include - +#include +#include #include "vas.h" #include "copy-paste.h" @@ -597,6 +598,32 @@ static void put_rx_win(struct vas_window *rxwin) atomic_dec(&rxwin->num_txwins); } +/* + * Find the user space receive window given the @pswid. + * - We must have a valid vasid and it must belong to this instance. + * (so both send and receive windows are on the same VAS instance) + * - The window must refer to an OPEN, FTW, RECEIVE window. + * + * NOTE: We access ->windows[] table and assume that vinst->mutex is held. 
+ */ +static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) +{ + int vasid, winid; + struct vas_window *rxwin; + + decode_pswid(pswid, &vasid, &winid); + + if (vinst->vas_id != vasid) + return ERR_PTR(-EINVAL); + + rxwin = vinst->windows[winid]; + + if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW) + return ERR_PTR(-EINVAL); + + return rxwin; +} + /* * Get the VAS receive window associated with NX engine identified * by @cop and if applicable, @pswid. @@ -610,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, mutex_lock(&vinst->mutex); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) - rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + if (cop == VAS_COP_TYPE_FTW) + rxwin = get_user_rxwin(vinst, pswid); else - rxwin = ERR_PTR(-EINVAL); + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); if (!IS_ERR(rxwin)) atomic_inc(&rxwin->num_txwins); @@ -937,10 +964,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->tx_word_mode = txattr->tx_win_ord_mode; winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; - if (winctx->nx_win) { + winctx->intr_disable = true; + if (winctx->nx_win) winctx->data_stamp = true; - winctx->intr_disable = true; - } winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; @@ -985,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, if (!tx_win_args_valid(cop, attr)) return ERR_PTR(-EINVAL); + /* + * If caller did not specify a vasid but specified the PSWID of a + * receive window (applicable only to FTW windows), use the vasid + * from that receive window. + */ + if (vasid == -1 && attr->pswid) + decode_pswid(attr->pswid, &vasid, NULL); + vinst = find_vas_instance(vasid); if (!vinst) { pr_devel("vasid %d not found!\n", vasid); @@ -1031,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } + /* + * Now that we have a send window, ensure context switch issues + * CP_ABORT for this thread. + */ + rc = -EINVAL; + if (set_thread_uses_vas() < 0) + goto free_window; + set_vinst_win(vinst, txwin); return txwin; -- cgit v1.2.3 From 2a31ad093bb1b8c1f1e04cbe222ad17bc09c4534 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:48:59 +1000 Subject: powerpc/powernv/npu: Use flush_all_mm() instead of flush_tlb_mm() With the optimisations introduced by commit a46cc7a908 ("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no longer flushes the page walk cache with radix. Switch to using flush_all_mm() to ensure the pwc and tlb are properly flushed on the nmmu. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2cb6cbea4b3b..2fff9a65975b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -549,7 +549,7 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * Unfortunately the nest mmu does not support flushing specific * addresses so we have to flush the whole mm. 
*/ - flush_tlb_mm(npu_context->mm); + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch -- cgit v1.2.3 From 1b2c2b12386f9bb009a2249eca00e01a9d76d7c1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:49:00 +1000 Subject: powerpc/powernv/npu: Don't explicitly flush nmmu tlb The nest mmu required an explicit flush as a tlbi would not flush it in the same way as the core. However an alternate firmware fix exists which should eliminate the need for this flush, so instead add a device-tree property (ibm,nmmu-flush) on the NVLink2 PHB to enable it only if required. Signed-off-by: Alistair Popple Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 28 +++++++++++++++++++++++----- arch/powerpc/platforms/powernv/pci.h | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2fff9a65975b..f6cbc1a71472 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -395,6 +395,7 @@ struct npu_context { struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; struct mmu_notifier mn; struct kref kref; + bool nmmu_flush; /* Callback to stop translation requests on a given GPU */ struct npu_context *(*release_cb)(struct npu_context *, void *); @@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_all_mm(npu_context->mm); + if (npu_context->nmmu_flush) + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm once before + * shooting down the GPU translation. 
+ */ + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch @@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-ENODEV); npu_context->npdev[npu->index][nvlink_index] = npdev; + if (!nphb->npu.nmmu_flush) { + /* + * If we're not explicitly flushing ourselves we need to mark + * the thread for global flushes + */ + npu_context->nmmu_flush = false; + mm_context_add_copro(mm); + } else + npu_context->nmmu_flush = true; + return npu_context; } EXPORT_SYMBOL(pnv_npu2_init_context); @@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref) struct npu_context *npu_context = container_of(kref, struct npu_context, kref); + if (!npu_context->nmmu_flush) + mm_context_remove_copro(npu_context->mm); + npu_context->mm->context.npu_context = NULL; mmu_notifier_unregister(&npu_context->mn, npu_context->mm); @@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + phb->npu.nmmu_flush = + of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); if (gpdev) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 56d1f272d4ad..96151b3a2dd4 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -187,6 +187,9 @@ struct pnv_phb { /* Bitmask for MMIO register usage */ unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; } npu; #ifdef CONFIG_CXL_BASE -- cgit v1.2.3
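Condensing the control flow of this last patch into a sketch (illustrative; hose_dn and mm stand in for the PHB device node and the process mm, and the real code spreads this across pnv_npu2_init() and pnv_npu2_init_context()):

/* Firmware lacking the alternate fix advertises "ibm,nmmu-flush". */
if (of_property_read_bool(hose_dn, "ibm,nmmu-flush")) {
        npu_context->nmmu_flush = true;         /* flush_all_mm() by hand */
} else {
        npu_context->nmmu_flush = false;
        mm_context_add_copro(mm);               /* rely on global flushes */
}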