diff options
author | Nathan Lynch <nathanl@linux.ibm.com> | 2023-02-10 12:42:00 -0600 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2023-02-13 22:35:02 +1100 |
commit | 43033bc62d349d8d852855a336c91d046de819bd (patch) | |
tree | 9f7ce98c09f955129e605446b45fedf442da0e20 /arch/powerpc/platforms | |
parent | 24098f580e2b5ceb2cec4f02833e0a0bb5d46d2e (diff) | |
download | linux-stable-43033bc62d349d8d852855a336c91d046de819bd.tar.gz linux-stable-43033bc62d349d8d852855a336c91d046de819bd.tar.bz2 linux-stable-43033bc62d349d8d852855a336c91d046de819bd.zip |
powerpc/pseries: add RTAS work area allocator
Various pseries-specific RTAS functions take a temporary "work area"
parameter - a buffer in memory accessible to RTAS. Typically such
functions are passed the statically allocated rtas_data_buf buffer as
the argument. This buffer is protected by a global spinlock. So users
of rtas_data_buf cannot perform sleeping operations while accessing
the buffer.
Most RTAS functions that have a work area parameter can return a
status (-2/990x) that indicates that the caller should retry. Before
retrying, the caller may need to reschedule or sleep (see
rtas_busy_delay() for details). This combination of factors
leads to uncomfortable constructions like this:
do {
spin_lock(&rtas_data_buf_lock);
rc = rtas_call(token, __pa(rtas_data_buf, ...);
if (rc == 0) {
/* parse or copy out rtas_data_buf contents */
}
spin_unlock(&rtas_data_buf_lock);
} while (rtas_busy_delay(rc));
Another unfortunately common way of handling this is for callers to
blithely ignore the possibility of a -2/990x status and hope for the
best.
If users were allowed to perform blocking operations while owning a
work area, the programming model would become less tedious and
error-prone. Users could schedule away, sleep, or perform other
blocking operations without having to release and re-acquire
resources.
We could continue to use a single work area buffer, and convert
rtas_data_buf_lock to a mutex. But that would impose an unnecessarily
coarse serialization on all users. As awkward as the current design
is, it prevents longer running operations that need to repeatedly use
rtas_data_buf from blocking the progress of others.
There are more considerations. One is that while 4KB is fine for all
current in-kernel uses, some RTAS calls can take much smaller buffers,
and some (VPD, platform dumps) would likely benefit from larger
ones. Another is that at least one RTAS function (ibm,get-vpd)
has *two* work area parameters. And finally, we should expect the
number of work area users in the kernel to increase over time as we
introduce lockdown-compatible ABIs to replace less safe use cases
based on sys_rtas/librtas.
So a special-purpose allocator for RTAS work area buffers seems worth
trying.
Properties:
* The backing memory for the allocator is reserved early in boot in
order to satisfy RTAS addressing requirements, and then managed with
genalloc.
* Allocations can block, but they never fail (mempool-like).
* Prioritizes first-come, first-serve fairness over throughput.
* Early boot allocations before the allocator has been initialized are
served via an internal static buffer.
Intended to replace rtas_data_buf. New code that needs RTAS work area
buffers should prefer this API.
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230125-b4-powerpc-rtas-queue-v3-12-26929c8cce78@linux.ibm.com
Diffstat (limited to 'arch/powerpc/platforms')
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/rtas-work-area.c | 209 |
2 files changed, 210 insertions, 1 deletions
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 20a0f3c3fe04..8992b09a7a20 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ - of_helpers.o \ + of_helpers.o rtas-work-area.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ diff --git a/arch/powerpc/platforms/pseries/rtas-work-area.c b/arch/powerpc/platforms/pseries/rtas-work-area.c new file mode 100644 index 000000000000..1ee63335bd4b --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-work-area.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "rtas-work-area: " fmt + +#include <linux/genalloc.h> +#include <linux/log2.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/mempool.h> +#include <linux/minmax.h> +#include <linux/mutex.h> +#include <linux/numa.h> +#include <linux/sizes.h> +#include <linux/wait.h> + +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> + +enum { + /* + * Ensure the pool is page-aligned. + */ + RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE, + /* + * Don't let a single allocation claim the whole arena. + */ + RTAS_WORK_AREA_ARENA_SZ = RTAS_WORK_AREA_MAX_ALLOC_SZ * 2, + /* + * The smallest known work area size is for ibm,get-vpd's + * location code argument, which is limited to 79 characters + * plus 1 nul terminator. + * + * PAPR+ 7.3.20 ibm,get-vpd RTAS Call + * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions + */ + RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80), +}; + +static struct { + struct gen_pool *gen_pool; + char *arena; + struct mutex mutex; /* serializes allocations */ + struct wait_queue_head wqh; + mempool_t descriptor_pool; + bool available; +} rwa_state = { + .mutex = __MUTEX_INITIALIZER(rwa_state.mutex), + .wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh), +}; + +/* + * A single work area buffer and descriptor to serve requests early in + * boot before the allocator is fully initialized. We know 4KB is the + * most any boot time user needs (they all call ibm,get-system-parameter). + */ +static bool early_work_area_in_use __initdata; +static char early_work_area_buf[SZ_4K] __initdata __aligned(SZ_4K); +static struct rtas_work_area early_work_area __initdata = { + .buf = early_work_area_buf, + .size = sizeof(early_work_area_buf), +}; + + +static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size) +{ + WARN_ON(size > early_work_area.size); + WARN_ON(early_work_area_in_use); + early_work_area_in_use = true; + memset(early_work_area.buf, 0, early_work_area.size); + return &early_work_area; +} + +static void __init rtas_work_area_free_early(struct rtas_work_area *work_area) +{ + WARN_ON(work_area != &early_work_area); + WARN_ON(!early_work_area_in_use); + early_work_area_in_use = false; +} + +struct rtas_work_area * __ref __rtas_work_area_alloc(size_t size) +{ + struct rtas_work_area *area; + unsigned long addr; + + might_sleep(); + + /* + * The rtas_work_area_alloc() wrapper enforces this at build + * time. Requests that exceed the arena size will block + * indefinitely. + */ + WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ); + + if (!rwa_state.available) + return rtas_work_area_alloc_early(size); + /* + * To ensure FCFS behavior and prevent a high rate of smaller + * requests from starving larger ones, use the mutex to queue + * allocations. + */ + mutex_lock(&rwa_state.mutex); + wait_event(rwa_state.wqh, + (addr = gen_pool_alloc(rwa_state.gen_pool, size)) != 0); + mutex_unlock(&rwa_state.mutex); + + area = mempool_alloc(&rwa_state.descriptor_pool, GFP_KERNEL); + area->buf = (char *)addr; + area->size = size; + + return area; +} + +void __ref rtas_work_area_free(struct rtas_work_area *area) +{ + if (!rwa_state.available) { + rtas_work_area_free_early(area); + return; + } + + gen_pool_free(rwa_state.gen_pool, (unsigned long)area->buf, area->size); + mempool_free(area, &rwa_state.descriptor_pool); + wake_up(&rwa_state.wqh); +} + +/* + * Initialization of the work area allocator happens in two parts. To + * reliably reserve an arena that satisfies RTAS addressing + * requirements, we must perform a memblock allocation early, + * immmediately after RTAS instantiation. Then we have to wait until + * the slab allocator is up before setting up the descriptor mempool + * and adding the arena to a gen_pool. + */ +static __init int rtas_work_area_allocator_init(void) +{ + const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ); + const phys_addr_t pa_start = __pa(rwa_state.arena); + const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1; + struct gen_pool *pool; + const int nid = NUMA_NO_NODE; + int err; + + err = -ENOMEM; + if (!rwa_state.arena) + goto err_out; + + pool = gen_pool_create(order, nid); + if (!pool) + goto err_out; + /* + * All RTAS functions that consume work areas are OK with + * natural alignment, when they have alignment requirements at + * all. + */ + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + err = gen_pool_add(pool, (unsigned long)rwa_state.arena, + RTAS_WORK_AREA_ARENA_SZ, nid); + if (err) + goto err_destroy; + + err = mempool_init_kmalloc_pool(&rwa_state.descriptor_pool, 1, + sizeof(struct rtas_work_area)); + if (err) + goto err_destroy; + + rwa_state.gen_pool = pool; + rwa_state.available = true; + + pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n", + &pa_start, &pa_end, + RTAS_WORK_AREA_ARENA_SZ / SZ_1K, + RTAS_WORK_AREA_MIN_ALLOC_SZ, + RTAS_WORK_AREA_MAX_ALLOC_SZ); + + return 0; + +err_destroy: + gen_pool_destroy(pool); +err_out: + return err; +} +machine_arch_initcall(pseries, rtas_work_area_allocator_init); + +/** + * rtas_work_area_reserve_arena() - Reserve memory suitable for RTAS work areas. + */ +void __init rtas_work_area_reserve_arena(const phys_addr_t limit) +{ + const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN; + const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ; + const phys_addr_t min = MEMBLOCK_LOW_LIMIT; + const int nid = NUMA_NO_NODE; + + /* + * Too early for a machine_is(pseries) check. But PAPR + * effectively mandates that ibm,get-system-parameter is + * present: + * + * R1–7.3.16–1. All platforms must support the System + * Parameters option. + * + * So set up the arena if we find that, with a fallback to + * ibm,configure-connector, just in case. + */ + if (rtas_service_present("ibm,get-system-parameter") || + rtas_service_present("ibm,configure-connector")) + rwa_state.arena = memblock_alloc_try_nid(size, align, min, limit, nid); +} |