From 0d6f1693f255795d5c747dc444d69c6512586d98 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 4 Dec 2019 09:29:20 +0100 Subject: s390/cpum_sf: Rework sampling buffer allocation Adjust sampling buffer allocation depending on frequency and correct comments. Investigation on the interrupt handler revealed that almost always one interrupt services one SDB, even when running with the maximum frequency of 100000. Very rarely there have been 2 SDBs serviced per interrupt. Therefore reduce the number of SDBs per CPU. Each SDB is one page in size. The new formula results in freq:4000 n_sdb:32 new:16 freq:10000 n_sdb:80 new:16 freq:20000 n_sdb:159 new:17 freq:40000 n_sdb:318 new:19 freq:50000 n_sdb:397 new:20 freq:62500 n_sdb:497 new:22 freq:83333 n_sdb:662 new:24 freq:100000 n_sdb:794 new:25 Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_sf.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index b095b1c78987..cf2020b8db44 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -372,28 +372,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { - unsigned long n_sdb, freq, factor; + unsigned long n_sdb, freq; size_t sample_size; /* Calculate sampling buffers using 4K pages * - * 1. Determine the sample data size which depends on the used - * sampling functions, for example, basic-sampling or - * basic-sampling with diagnostic-sampling. + * 1. The sampling size is 32 bytes for basic sampling. This size + * is the same for all machine types. Diagnostic + * sampling uses auxlilary data buffer setup which provides the + * memory for SDBs using linux common code auxiliary trace + * setup. * - * 2. Use the sampling frequency as input. 
The sampling buffer is - * designed for almost one second. This can be adjusted through - * the "factor" variable. - * In any case, alloc_sampling_buffer() sets the Alert Request + * 2. Function alloc_sampling_buffer() sets the Alert Request * Control indicator to trigger a measurement-alert to harvest - * sample-data-blocks (sdb). + * sample-data-blocks (SDB). This is done per SDB. This + * measurement alert interrupt fires quick enough to handle + * one SDB, on very high frequency and work loads there might + * be 2 to 3 SBDs available for sample processing. + * Currently there is no need for setup alert request on every + * n-th page. This is counterproductive as one IRQ triggers + * a very high number of samples to be processed at one IRQ. * - * 3. Compute the number of sample-data-blocks and ensure a minimum - * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not - * exceed a "calculated" maximum. The symbolic maximum is - * designed for basic-sampling only and needs to be increased if - * diagnostic-sampling is active. - * See also the remarks for these symbolic constants. + * 3. Use the sampling frequency as input. + * Compute the number of SDBs and ensure a minimum + * of CPUM_SF_MIN_SDB. Depending on frequency add some more + * SDBs to handle a higher sampling rate. + * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples + * (one SDB) for every 10000 HZ frequency increment. * * 4. 
Compute the number of sample-data-block-tables (SDBT) and * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up @@ -401,10 +406,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) */ sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); - factor = 1; - n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); - if (n_sdb < CPUM_SF_MIN_SDB) - n_sdb = CPUM_SF_MIN_SDB; + n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); /* If there is already a sampling buffer allocated, it is very likely * that the sampling facility is enabled too. If the event to be -- cgit v1.2.3 From b059a39cfa27c04e8e03e4ddf44f16501f36357d Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 20 Jan 2020 13:10:37 +0100 Subject: s390/arch: install kernels with their proper version ID In case $INSTALLKERNEL is not available, we should install the kernel image with its version number, and save the previous one accordingly. Also, we're adding a hint so users know that they still need to perform one more configuration step (usually adjusting zipl config). 
Signed-off-by: Stefan Raspl Signed-off-by: Vasily Gorbik --- arch/s390/boot/install.sh | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index bed227f267ae..515b27a996b3 100644 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -21,15 +21,10 @@ if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi -# Default install - same as make zlilo +echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ + "bootloader config required" >&2 +if [ -f $4/vmlinuz-$1 ]; then mv $4/vmlinuz-$1 $4/vmlinuz-$1.old; fi +if [ -f $4/System.map-$1 ]; then mv $4/System.map-$1 $4/System.map-$1.old; fi -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old -fi - -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old -fi - -cat $2 > $4/vmlinuz -cp $3 $4/System.map +cat $2 > $4/vmlinuz-$1 +cp $3 $4/System.map-$1 -- cgit v1.2.3 From fa226f1d81e2d3798d30eaa14550d7f35c35e6f3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 21 Feb 2020 09:06:12 -0600 Subject: s390: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. 
As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Link: https://lkml.kernel.org/r/20200221150612.GA9717@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Vasily Gorbik --- arch/s390/appldata/appldata_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 54f375627532..8bf46d705957 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -75,7 +75,7 @@ struct appldata_os_data { (waiting for I/O) */ /* per cpu data */ - struct appldata_os_per_cpu os_cpu[0]; + struct appldata_os_per_cpu os_cpu[]; } __attribute__((packed)); static struct appldata_os_data *appldata_os_data; -- cgit v1.2.3 From 4a559cd15dbc79958fa9b18ad4e8afe4a0bf4744 Mon Sep 17 00:00:00 2001 From: Torsten Duwe Date: Tue, 25 Feb 2020 15:34:30 +0100 Subject: s390/crypto: explicitly memzero stack key material in aes_s390.c aes_s390.c has several functions which allocate space for key material on the stack and leave the used keys there. It is considered good practice to clean these locations before the function returns. 
Link: https://lkml.kernel.org/r/20200221165511.GB6928@lst.de Signed-off-by: Torsten Duwe Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- arch/s390/crypto/aes_s390.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 1c23d84a9097..73044634d342 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -342,6 +342,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(&param, sizeof(param)); return ret; } @@ -470,6 +471,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) walk.dst.virt.addr, walk.src.virt.addr, n); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(&pcc_param, sizeof(pcc_param)); + memzero_explicit(&xts_param, sizeof(xts_param)); return ret; } -- cgit v1.2.3 From 701dc81e7412daaf3c5bf4bc55d35c8b1525112a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 19 Feb 2020 13:29:15 +0100 Subject: s390/mm: remove fake numa support It turned out that fake numa support is rather useless on s390, since there are no scenarios where there is any performance or other benefit when used. However it does provide maintenance cost and breaks from time to time. Therefore remove it. CONFIG_NUMA is still supported with a very small backend and only one node. This way userspace applications which require NUMA interfaces continue to work. Note that NODES_SHIFT is set to 1 (= 2 nodes) instead of 0 (= 1 node), since there is quite a bit of kernel code which assumes that more than one node is possible if CONFIG_NUMA is enabled. 
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 61 +---- arch/s390/include/asm/numa.h | 13 +- arch/s390/include/asm/topology.h | 9 +- arch/s390/kernel/setup.c | 1 + arch/s390/kernel/topology.c | 2 - arch/s390/numa/Makefile | 2 - arch/s390/numa/mode_emu.c | 577 --------------------------------------- arch/s390/numa/numa.c | 147 +--------- arch/s390/numa/numa_mode.h | 25 -- arch/s390/numa/toptree.c | 351 ------------------------ arch/s390/numa/toptree.h | 61 ----- 11 files changed, 19 insertions(+), 1230 deletions(-) delete mode 100644 arch/s390/numa/mode_emu.c delete mode 100644 arch/s390/numa/numa_mode.h delete mode 100644 arch/s390/numa/toptree.c delete mode 100644 arch/s390/numa/toptree.h (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8abe77536d9d..6b1f715dd8bb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -450,14 +450,6 @@ config NR_CPUS config HOTPLUG_CPU def_bool y -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. <- They meant memory holes! -config NODES_SPAN_OTHER_NODES - def_bool NUMA - config NUMA bool "NUMA support" depends on SCHED_TOPOLOGY @@ -467,58 +459,9 @@ config NUMA This option adds NUMA support to the kernel. - An operation mode can be selected by appending - numa= to the kernel command line. - - The default behaviour is identical to appending numa=plain to - the command line. This will create just one node with all - available memory and all CPUs in it. - config NODES_SHIFT - int "Maximum NUMA nodes (as a power of 2)" - range 1 10 - depends on NUMA - default "4" - help - Specify the maximum number of NUMA nodes available on the target - system. Increases memory reserved to accommodate various tables. 
- -menu "Select NUMA modes" - depends on NUMA - -config NUMA_EMU - bool "NUMA emulation" - default y - help - Numa emulation mode will split the available system memory into - equal chunks which then are distributed over the configured number - of nodes in a round-robin manner. - - The number of fake nodes is limited by the number of available memory - chunks (i.e. memory size / fake size) and the number of supported - nodes in the kernel. - - The CPUs are assigned to the nodes in a way that partially respects - the original machine topology (if supported by the machine). - Fair distribution of the CPUs is not guaranteed. - -config EMU_SIZE - hex "NUMA emulation memory chunk size" - default 0x10000000 - range 0x400000 0x100000000 - depends on NUMA_EMU - help - Select the default size by which the memory is chopped and then - assigned to emulated NUMA nodes. - - This can be overridden by specifying - - emu_size= - - on the kernel command line where also suffixes K, M, G, and T are - supported. 
- -endmenu + int + default "1" config SCHED_SMT def_bool n diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h index 35f8cbe7e5bb..23cd5d1b734b 100644 --- a/arch/s390/include/asm/numa.h +++ b/arch/s390/include/asm/numa.h @@ -13,24 +13,13 @@ #ifdef CONFIG_NUMA #include -#include void numa_setup(void); -int numa_pfn_to_nid(unsigned long pfn); -int __node_distance(int a, int b); -void numa_update_cpu_topology(void); - -extern cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -extern int numa_debug_enabled; #else static inline void numa_setup(void) { } -static inline void numa_update_cpu_topology(void) { } -static inline int numa_pfn_to_nid(unsigned long pfn) -{ - return 0; -} #endif /* CONFIG_NUMA */ + #endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index cca406fdbe51..bd3417185e30 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -16,7 +16,6 @@ struct cpu_topology_s390 { unsigned short socket_id; unsigned short book_id; unsigned short drawer_id; - unsigned short node_id; unsigned short dedicated : 1; cpumask_t thread_mask; cpumask_t core_mask; @@ -71,19 +70,23 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return cpu_topology[cpu].node_id; + return 0; } /* Returns a pointer to the cpumask of CPUs on node 'node'. 
*/ #define cpumask_of_node cpumask_of_node static inline const struct cpumask *cpumask_of_node(int node) { - return &node_to_cpumask_map[node]; + return cpu_possible_mask; } #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) +static inline int __node_distance(int a, int b) +{ + return 0; +} #else /* !CONFIG_NUMA */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b2c2f75860e8..1158a63a8e0e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -790,6 +790,7 @@ static void __init memblock_add_mem_detect_info(void) memblock_physmem_add(start, end - start); } memblock_set_bottom_up(false); + memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); memblock_dump_all(); } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3627953007ed..c189f5d996ff 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -26,7 +26,6 @@ #include #include #include -#include #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -267,7 +266,6 @@ static void update_cpu_masks(void) cpumask_set_cpu(cpu, &cpus_with_topology); } } - numa_update_cpu_topology(); } void store_topology(struct sysinfo_15_1_x *info) diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile index 66c2dff74895..c89d26f4f77d 100644 --- a/arch/s390/numa/Makefile +++ b/arch/s390/numa/Makefile @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += numa.o -obj-y += toptree.o -obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c deleted file mode 100644 index 72d742bb2d17..000000000000 --- a/arch/s390/numa/mode_emu.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * NUMA emulation (aka fake NUMA) distributes the available memory to nodes - * without using real topology information about the physical memory of the - * machine. 
- * - * It distributes the available CPUs to nodes while respecting the original - * machine topology information. This is done by trying to avoid to separate - * CPUs which reside on the same book or even on the same MC. - * - * Because the current Linux scheduler code requires a stable cpu to node - * mapping, cores are pinned to nodes when the first CPU thread is set online. - * - * Copyright IBM Corp. 2015 - */ - -#define KMSG_COMPONENT "numa_emu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include "numa_mode.h" -#include "toptree.h" - -/* Distances between the different system components */ -#define DIST_EMPTY 0 -#define DIST_CORE 1 -#define DIST_MC 2 -#define DIST_BOOK 3 -#define DIST_DRAWER 4 -#define DIST_MAX 5 - -/* Node distance reported to common code */ -#define EMU_NODE_DIST 10 - -/* Node ID for free (not yet pinned) cores */ -#define NODE_ID_FREE -1 - -/* Different levels of toptree */ -enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY}; - -/* The two toptree IDs */ -enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; - -/* Number of NUMA nodes */ -static int emu_nodes = 1; -/* NUMA stripe size */ -static unsigned long emu_size; - -/* - * Node to core pinning information updates are protected by - * "sched_domains_mutex". 
- */ -static struct { - s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ - int total; /* Total number of pinned cores */ - int per_node_target; /* Cores per node without extra cores */ - int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ -} *emu_cores; - -/* - * Pin a core to a node - */ -static void pin_core_to_node(int core_id, int node_id) -{ - if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { - emu_cores->per_node[node_id]++; - emu_cores->to_node_id[core_id] = node_id; - emu_cores->total++; - } else { - WARN_ON(emu_cores->to_node_id[core_id] != node_id); - } -} - -/* - * Number of pinned cores of a node - */ -static int cores_pinned(struct toptree *node) -{ - return emu_cores->per_node[node->id]; -} - -/* - * ID of the node where the core is pinned (or NODE_ID_FREE) - */ -static int core_pinned_to_node_id(struct toptree *core) -{ - return emu_cores->to_node_id[core->id]; -} - -/* - * Number of cores in the tree that are not yet pinned - */ -static int cores_free(struct toptree *tree) -{ - struct toptree *core; - int count = 0; - - toptree_for_each(core, tree, CORE) { - if (core_pinned_to_node_id(core) == NODE_ID_FREE) - count++; - } - return count; -} - -/* - * Return node of core - */ -static struct toptree *core_node(struct toptree *core) -{ - return core->parent->parent->parent->parent; -} - -/* - * Return drawer of core - */ -static struct toptree *core_drawer(struct toptree *core) -{ - return core->parent->parent->parent; -} - -/* - * Return book of core - */ -static struct toptree *core_book(struct toptree *core) -{ - return core->parent->parent; -} - -/* - * Return mc of core - */ -static struct toptree *core_mc(struct toptree *core) -{ - return core->parent; -} - -/* - * Distance between two cores - */ -static int dist_core_to_core(struct toptree *core1, struct toptree *core2) -{ - if (core_drawer(core1)->id != core_drawer(core2)->id) - return DIST_DRAWER; - if (core_book(core1)->id != core_book(core2)->id) - return 
DIST_BOOK; - if (core_mc(core1)->id != core_mc(core2)->id) - return DIST_MC; - /* Same core or sibling on same MC */ - return DIST_CORE; -} - -/* - * Distance of a node to a core - */ -static int dist_node_to_core(struct toptree *node, struct toptree *core) -{ - struct toptree *core_node; - int dist_min = DIST_MAX; - - toptree_for_each(core_node, node, CORE) - dist_min = min(dist_min, dist_core_to_core(core_node, core)); - return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; -} - -/* - * Unify will delete empty nodes, therefore recreate nodes. - */ -static void toptree_unify_tree(struct toptree *tree) -{ - int nid; - - toptree_unify(tree); - for (nid = 0; nid < emu_nodes; nid++) - toptree_get_child(tree, nid); -} - -/* - * Find the best/nearest node for a given core and ensure that no node - * gets more than "emu_cores->per_node_target + extra" cores. - */ -static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, - int extra) -{ - struct toptree *node, *node_best = NULL; - int dist_cur, dist_best, cores_target; - - cores_target = emu_cores->per_node_target + extra; - dist_best = DIST_MAX; - node_best = NULL; - toptree_for_each(node, numa, NODE) { - /* Already pinned cores must use their nodes */ - if (core_pinned_to_node_id(core) == node->id) { - node_best = node; - break; - } - /* Skip nodes that already have enough cores */ - if (cores_pinned(node) >= cores_target) - continue; - dist_cur = dist_node_to_core(node, core); - if (dist_cur < dist_best) { - dist_best = dist_cur; - node_best = node; - } - } - return node_best; -} - -/* - * Find the best node for each core with respect to "extra" core count - */ -static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, - int extra) -{ - struct toptree *node, *core, *tmp; - - toptree_for_each_safe(core, tmp, phys, CORE) { - node = node_for_core(numa, core, extra); - if (!node) - return; - toptree_move(core, node); - pin_core_to_node(core->id, node->id); - } -} - -/* - * Move 
structures of given level to specified NUMA node - */ -static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - int cores_free, cores_target = emu_cores->per_node_target; - struct toptree *cur, *tmp; - - toptree_for_each_safe(cur, tmp, phys, level) { - cores_free = cores_target - toptree_count(node, CORE); - if (perfect) { - if (cores_free == toptree_count(cur, CORE)) - toptree_move(cur, node); - } else { - if (cores_free >= toptree_count(cur, CORE)) - toptree_move(cur, node); - } - } -} - -/* - * Move structures of a given level to NUMA nodes. If "perfect" is specified - * move only perfectly fitting structures. Otherwise move also smaller - * than needed structures. - */ -static void move_level_to_numa(struct toptree *numa, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - struct toptree *node; - - toptree_for_each(node, numa, NODE) - move_level_to_numa_node(node, phys, level, perfect); -} - -/* - * For the first run try to move the big structures - */ -static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) -{ - struct toptree *core; - - /* Always try to move perfectly fitting structures first */ - move_level_to_numa(numa, phys, DRAWER, true); - move_level_to_numa(numa, phys, DRAWER, false); - move_level_to_numa(numa, phys, BOOK, true); - move_level_to_numa(numa, phys, BOOK, false); - move_level_to_numa(numa, phys, MC, true); - move_level_to_numa(numa, phys, MC, false); - /* Now pin all the moved cores */ - toptree_for_each(core, numa, CORE) - pin_core_to_node(core->id, core_node(core)->id); -} - -/* - * Allocate new topology and create required nodes - */ -static struct toptree *toptree_new(int id, int nodes) -{ - struct toptree *tree; - int nid; - - tree = toptree_alloc(TOPOLOGY, id); - if (!tree) - goto fail; - for (nid = 0; nid < nodes; nid++) { - if (!toptree_get_child(tree, nid)) - goto fail; - } - return tree; -fail: - panic("NUMA emulation could 
not allocate topology"); -} - -/* - * Allocate and initialize core to node mapping - */ -static void __ref create_core_to_node_map(void) -{ - int i; - - emu_cores = memblock_alloc(sizeof(*emu_cores), 8); - if (!emu_cores) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*emu_cores), 8); - for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) - emu_cores->to_node_id[i] = NODE_ID_FREE; -} - -/* - * Move cores from physical topology into NUMA target topology - * and try to keep as much of the physical topology as possible. - */ -static struct toptree *toptree_to_numa(struct toptree *phys) -{ - static int first = 1; - struct toptree *numa; - int cores_total; - - cores_total = emu_cores->total + cores_free(phys); - emu_cores->per_node_target = cores_total / emu_nodes; - numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); - if (first) { - toptree_to_numa_first(numa, phys); - first = 0; - } - toptree_to_numa_single(numa, phys, 0); - toptree_to_numa_single(numa, phys, 1); - toptree_unify_tree(numa); - - WARN_ON(cpumask_weight(&phys->mask)); - return numa; -} - -/* - * Create a toptree out of the physical topology that we got from the hypervisor - */ -static struct toptree *toptree_from_topology(void) -{ - struct toptree *phys, *node, *drawer, *book, *mc, *core; - struct cpu_topology_s390 *top; - int cpu; - - phys = toptree_new(TOPTREE_ID_PHYS, 1); - - for_each_cpu(cpu, &cpus_with_topology) { - top = &cpu_topology[cpu]; - node = toptree_get_child(phys, 0); - drawer = toptree_get_child(node, top->drawer_id); - book = toptree_get_child(drawer, top->book_id); - mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, smp_get_base_cpu(cpu)); - if (!drawer || !book || !mc || !core) - panic("NUMA emulation could not allocate memory"); - cpumask_set_cpu(cpu, &core->mask); - toptree_update_mask(mc); - } - return phys; -} - -/* - * Add toptree core to topology and create correct CPU masks - */ -static void topology_add_core(struct toptree 
*core) -{ - struct cpu_topology_s390 *top; - int cpu; - - for_each_cpu(cpu, &core->mask) { - top = &cpu_topology[cpu]; - cpumask_copy(&top->thread_mask, &core->mask); - cpumask_copy(&top->core_mask, &core_mc(core)->mask); - cpumask_copy(&top->book_mask, &core_book(core)->mask); - cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask); - cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); - top->node_id = core_node(core)->id; - } -} - -/* - * Apply toptree to topology and create CPU masks - */ -static void toptree_to_topology(struct toptree *numa) -{ - struct toptree *core; - int i; - - /* Clear all node masks */ - for (i = 0; i < MAX_NUMNODES; i++) - cpumask_clear(&node_to_cpumask_map[i]); - - /* Rebuild all masks */ - toptree_for_each(core, numa, CORE) - topology_add_core(core); -} - -/* - * Show the node to core mapping - */ -static void print_node_to_core_map(void) -{ - int nid, cid; - - if (!numa_debug_enabled) - return; - printk(KERN_DEBUG "NUMA node to core mapping\n"); - for (nid = 0; nid < emu_nodes; nid++) { - printk(KERN_DEBUG " node %3d: ", nid); - for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { - if (emu_cores->to_node_id[cid] == nid) - printk(KERN_CONT "%d ", cid); - } - printk(KERN_CONT "\n"); - } -} - -static void pin_all_possible_cpus(void) -{ - int core_id, node_id, cpu; - static int initialized; - - if (initialized) - return; - print_node_to_core_map(); - node_id = 0; - for_each_possible_cpu(cpu) { - core_id = smp_get_base_cpu(cpu); - if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) - continue; - pin_core_to_node(core_id, node_id); - cpu_topology[cpu].node_id = node_id; - node_id = (node_id + 1) % emu_nodes; - } - print_node_to_core_map(); - initialized = 1; -} - -/* - * Transfer physical topology into a NUMA topology and modify CPU masks - * according to the NUMA topology. - * - * Must be called with "sched_domains_mutex" lock held. 
- */ -static void emu_update_cpu_topology(void) -{ - struct toptree *phys, *numa; - - if (emu_cores == NULL) - create_core_to_node_map(); - phys = toptree_from_topology(); - numa = toptree_to_numa(phys); - toptree_free(phys); - toptree_to_topology(numa); - toptree_free(numa); - pin_all_possible_cpus(); -} - -/* - * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum - * alignment (needed for memory hotplug). - */ -static unsigned long emu_setup_size_adjust(unsigned long size) -{ - unsigned long size_new; - - size = size ? : CONFIG_EMU_SIZE; - size_new = roundup(size, memory_block_size_bytes()); - if (size_new == size) - return size; - pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n", - size >> 20, size_new >> 20); - return size_new; -} - -/* - * If we have not enough memory for the specified nodes, reduce the node count. - */ -static int emu_setup_nodes_adjust(int nodes) -{ - int nodes_max; - - nodes_max = memblock.memory.total_size / emu_size; - nodes_max = max(nodes_max, 1); - if (nodes_max >= nodes) - return nodes; - pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); - return nodes_max; -} - -/* - * Early emu setup - */ -static void emu_setup(void) -{ - int nid; - - emu_size = emu_setup_size_adjust(emu_size); - emu_nodes = emu_setup_nodes_adjust(emu_nodes); - for (nid = 0; nid < emu_nodes; nid++) - node_set(nid, node_possible_map); - pr_info("Creating %d nodes with memory stripe size %ld MB\n", - emu_nodes, emu_size >> 20); -} - -/* - * Return node id for given page number - */ -static int emu_pfn_to_nid(unsigned long pfn) -{ - return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; -} - -/* - * Return stripe size - */ -static unsigned long emu_align(void) -{ - return emu_size; -} - -/* - * Return distance between two nodes - */ -static int emu_distance(int node1, int node2) -{ - return (node1 != node2) * EMU_NODE_DIST; -} - -/* - * Define callbacks for generic s390 NUMA infrastructure - */ -const struct numa_mode 
numa_mode_emu = { - .name = "emu", - .setup = emu_setup, - .update_cpu_topology = emu_update_cpu_topology, - .__pfn_to_nid = emu_pfn_to_nid, - .align = emu_align, - .distance = emu_distance, -}; - -/* - * Kernel parameter: emu_nodes= - */ -static int __init early_parse_emu_nodes(char *p) -{ - int count; - - if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0) - return 0; - emu_nodes = min(count, MAX_NUMNODES); - return 0; -} -early_param("emu_nodes", early_parse_emu_nodes); - -/* - * Kernel parameter: emu_size=[[k|M|G|T]] - */ -static int __init early_parse_emu_size(char *p) -{ - if (p) - emu_size = memparse(p, NULL); - return 0; -} -early_param("emu_size", early_parse_emu_size); diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c index d2910fa834c8..51c5a9f6e525 100644 --- a/arch/s390/numa/numa.c +++ b/arch/s390/numa/numa.c @@ -7,165 +7,36 @@ * Copyright IBM Corp. 2015 */ -#define KMSG_COMPONENT "numa" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - #include #include #include #include -#include #include - #include -#include "numa_mode.h" -pg_data_t *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(node_data); -cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -EXPORT_SYMBOL(node_to_cpumask_map); - -static void plain_setup(void) -{ - node_set(0, node_possible_map); -} - -const struct numa_mode numa_mode_plain = { - .name = "plain", - .setup = plain_setup, -}; - -static const struct numa_mode *mode = &numa_mode_plain; - -int numa_pfn_to_nid(unsigned long pfn) -{ - return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; -} - -void numa_update_cpu_topology(void) -{ - if (mode->update_cpu_topology) - mode->update_cpu_topology(); -} - -int __node_distance(int a, int b) -{ - return mode->distance ? mode->distance(a, b) : 0; -} -EXPORT_SYMBOL(__node_distance); - -int numa_debug_enabled; - -/* - * numa_setup_memory() - Assign bootmem to nodes - * - * The memory is first added to memblock without any respect to nodes. 
- * This is fixed before remaining memblock memory is handed over to the - * buddy allocator. - * An important side effect is that large bootmem allocations might easily - * cross node boundaries, which can be needed for large allocations with - * smaller memory stripes in each node (i.e. when using NUMA emulation). - * - * Memory defines nodes: - * Therefore this routine also sets the nodes online with memory. - */ -static void __init numa_setup_memory(void) +void __init numa_setup(void) { - unsigned long cur_base, align, end_of_dram; - int nid = 0; - - end_of_dram = memblock_end_of_DRAM(); - align = mode->align ? mode->align() : ULONG_MAX; - - /* - * Step through all available memory and assign it to the nodes - * indicated by the mode implementation. - * All nodes which are seen here will be set online. - */ - cur_base = 0; - do { - nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); - node_set_online(nid); - memblock_set_node(cur_base, align, &memblock.memory, nid); - cur_base += align; - } while (cur_base < end_of_dram); + int nid; - /* Allocate and fill out node_data */ + nodes_clear(node_possible_map); + node_set(0, node_possible_map); + node_set_online(0); for (nid = 0; nid < MAX_NUMNODES; nid++) { NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); if (!NODE_DATA(nid)) panic("%s: Failed to allocate %zu bytes align=0x%x\n", __func__, sizeof(pg_data_t), 8); } - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - unsigned long t_start, t_end; - int i; - - start_pfn = ULONG_MAX; - end_pfn = 0; - for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { - if (t_start < start_pfn) - start_pfn = t_start; - if (t_end > end_pfn) - end_pfn = t_end; - } - NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; - NODE_DATA(nid)->node_id = nid; - } -} - -/* - * numa_setup() - Earliest initialization - * - * Assign the mode and call the mode's setup routine. 
- */ -void __init numa_setup(void) -{ - pr_info("NUMA mode: %s\n", mode->name); - nodes_clear(node_possible_map); - /* Initially attach all possible CPUs to node 0. */ - cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask); - if (mode->setup) - mode->setup(); - numa_setup_memory(); - memblock_dump_all(); + NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; + NODE_DATA(0)->node_id = 0; } -/* - * numa_init_late() - Initialization initcall - * - * Register NUMA nodes. - */ static int __init numa_init_late(void) { - int nid; - - for_each_online_node(nid) - register_one_node(nid); + register_one_node(0); return 0; } arch_initcall(numa_init_late); - -static int __init parse_debug(char *parm) -{ - numa_debug_enabled = 1; - return 0; -} -early_param("numa_debug", parse_debug); - -static int __init parse_numa(char *parm) -{ - if (!parm) - return 1; - if (strcmp(parm, numa_mode_plain.name) == 0) - mode = &numa_mode_plain; -#ifdef CONFIG_NUMA_EMU - if (strcmp(parm, numa_mode_emu.name) == 0) - mode = &numa_mode_emu; -#endif - return 0; -} -early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h deleted file mode 100644 index dfd3e2784081..000000000000 --- a/arch/s390/numa/numa_mode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Define declarations used for communication between NUMA mode - * implementations and NUMA core functionality. - * - * Copyright IBM Corp. 
2015 - */ -#ifndef __S390_NUMA_MODE_H -#define __S390_NUMA_MODE_H - -struct numa_mode { - char *name; /* Name of mode */ - void (*setup)(void); /* Initizalize mode */ - void (*update_cpu_topology)(void); /* Called by topology code */ - int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ - unsigned long (*align)(void); /* Minimum node alignment */ - int (*distance)(int a, int b); /* Distance between two nodes */ -}; - -extern const struct numa_mode numa_mode_plain; -extern const struct numa_mode numa_mode_emu; - -#endif /* __S390_NUMA_MODE_H */ diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c deleted file mode 100644 index 71a608cd4f61..000000000000 --- a/arch/s390/numa/toptree.c +++ /dev/null @@ -1,351 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "toptree.h" - -/** - * toptree_alloc - Allocate and initialize a new tree node. - * @level: The node's vertical level; level 0 contains the leaves. - * @id: ID number, explicitly not unique beyond scope of node's siblings - * - * Allocate a new tree node and initialize it. - * - * RETURNS: - * Pointer to the new tree node or NULL on error - */ -struct toptree __ref *toptree_alloc(int level, int id) -{ - struct toptree *res; - - if (slab_is_available()) - res = kzalloc(sizeof(*res), GFP_KERNEL); - else - res = memblock_alloc(sizeof(*res), 8); - if (!res) - return res; - - INIT_LIST_HEAD(&res->children); - INIT_LIST_HEAD(&res->sibling); - cpumask_clear(&res->mask); - res->level = level; - res->id = id; - return res; -} - -/** - * toptree_remove - Remove a tree node from a tree - * @cand: Pointer to the node to remove - * - * The node is detached from its parent node. The parent node's - * masks will be updated to reflect the loss of the child. 
- */ -static void toptree_remove(struct toptree *cand) -{ - struct toptree *oldparent; - - list_del_init(&cand->sibling); - oldparent = cand->parent; - cand->parent = NULL; - toptree_update_mask(oldparent); -} - -/** - * toptree_free - discard a tree node - * @cand: Pointer to the tree node to discard - * - * Checks if @cand is attached to a parent node. Detaches it - * cleanly using toptree_remove. Possible children are freed - * recursively. In the end @cand itself is freed. - */ -void __ref toptree_free(struct toptree *cand) -{ - struct toptree *child, *tmp; - - if (cand->parent) - toptree_remove(cand); - toptree_for_each_child_safe(child, tmp, cand) - toptree_free(child); - if (slab_is_available()) - kfree(cand); - else - memblock_free_early((unsigned long)cand, sizeof(*cand)); -} - -/** - * toptree_update_mask - Update node bitmasks - * @cand: Pointer to a tree node - * - * The node's cpumask will be updated by combining all children's - * masks. Then toptree_update_mask is called recursively for the - * parent if applicable. - * - * NOTE: - * This must not be called on leaves. If called on a leaf, its - * CPU mask is cleared and lost. - */ -void toptree_update_mask(struct toptree *cand) -{ - struct toptree *child; - - cpumask_clear(&cand->mask); - list_for_each_entry(child, &cand->children, sibling) - cpumask_or(&cand->mask, &cand->mask, &child->mask); - if (cand->parent) - toptree_update_mask(cand->parent); -} - -/** - * toptree_insert - Insert a tree node into tree - * @cand: Pointer to the node to insert - * @target: Pointer to the node to which @cand will added as a child - * - * Insert a tree node into a tree. Masks will be updated automatically. - * - * RETURNS: - * 0 on success, -1 if NULL is passed as argument or the node levels - * don't fit. 
- */ -static int toptree_insert(struct toptree *cand, struct toptree *target) -{ - if (!cand || !target) - return -1; - if (target->level != (cand->level + 1)) - return -1; - list_add_tail(&cand->sibling, &target->children); - cand->parent = target; - toptree_update_mask(target); - return 0; -} - -/** - * toptree_move_children - Move all child nodes of a node to a new place - * @cand: Pointer to the node whose children are to be moved - * @target: Pointer to the node to which @cand's children will be attached - * - * Take all child nodes of @cand and move them using toptree_move. - */ -static void toptree_move_children(struct toptree *cand, struct toptree *target) -{ - struct toptree *child, *tmp; - - toptree_for_each_child_safe(child, tmp, cand) - toptree_move(child, target); -} - -/** - * toptree_unify - Merge children with same ID - * @cand: Pointer to node whose direct children should be made unique - * - * When mangling the tree it is possible that a node has two or more children - * which have the same ID. This routine merges these children into one and - * moves all children of the merged nodes into the unified node. 
- */ -void toptree_unify(struct toptree *cand) -{ - struct toptree *child, *tmp, *cand_copy; - - /* Threads cannot be split, cores are not split */ - if (cand->level < 2) - return; - - cand_copy = toptree_alloc(cand->level, 0); - toptree_for_each_child_safe(child, tmp, cand) { - struct toptree *tmpchild; - - if (!cpumask_empty(&child->mask)) { - tmpchild = toptree_get_child(cand_copy, child->id); - toptree_move_children(child, tmpchild); - } - toptree_free(child); - } - toptree_move_children(cand_copy, cand); - toptree_free(cand_copy); - - toptree_for_each_child(child, cand) - toptree_unify(child); -} - -/** - * toptree_move - Move a node to another context - * @cand: Pointer to the node to move - * @target: Pointer to the node where @cand should go - * - * In the easiest case @cand is exactly on the level below @target - * and will be immediately moved to the target. - * - * If @target's level is not the direct parent level of @cand, - * nodes for the missing levels are created and put between - * @cand and @target. The "stacking" nodes' IDs are taken from - * @cand's parents. - * - * After this it is likely to have redundant nodes in the tree - * which are addressed by means of toptree_unify. 
- */ -void toptree_move(struct toptree *cand, struct toptree *target) -{ - struct toptree *stack_target, *real_insert_point, *ptr, *tmp; - - if (cand->level + 1 == target->level) { - toptree_remove(cand); - toptree_insert(cand, target); - return; - } - - real_insert_point = NULL; - ptr = cand; - stack_target = NULL; - - do { - tmp = stack_target; - stack_target = toptree_alloc(ptr->level + 1, - ptr->parent->id); - toptree_insert(tmp, stack_target); - if (!real_insert_point) - real_insert_point = stack_target; - ptr = ptr->parent; - } while (stack_target->level < (target->level - 1)); - - toptree_remove(cand); - toptree_insert(cand, real_insert_point); - toptree_insert(stack_target, target); -} - -/** - * toptree_get_child - Access a tree node's child by its ID - * @cand: Pointer to tree node whose child is to access - * @id: The desired child's ID - * - * @cand's children are searched for a child with matching ID. - * If no match can be found, a new child with the desired ID - * is created and returned. 
- */ -struct toptree *toptree_get_child(struct toptree *cand, int id) -{ - struct toptree *child; - - toptree_for_each_child(child, cand) - if (child->id == id) - return child; - child = toptree_alloc(cand->level-1, id); - toptree_insert(child, cand); - return child; -} - -/** - * toptree_first - Find the first descendant on specified level - * @context: Pointer to tree node whose descendants are to be used - * @level: The level of interest - * - * RETURNS: - * @context's first descendant on the specified level, or NULL - * if there is no matching descendant - */ -struct toptree *toptree_first(struct toptree *context, int level) -{ - struct toptree *child, *tmp; - - if (context->level == level) - return context; - - if (!list_empty(&context->children)) { - list_for_each_entry(child, &context->children, sibling) { - tmp = toptree_first(child, level); - if (tmp) - return tmp; - } - } - return NULL; -} - -/** - * toptree_next_sibling - Return next sibling - * @cur: Pointer to a tree node - * - * RETURNS: - * If @cur has a parent and is not the last in the parent's children list, - * the next sibling is returned. Or NULL when there are no siblings left. - */ -static struct toptree *toptree_next_sibling(struct toptree *cur) -{ - if (cur->parent == NULL) - return NULL; - - if (cur == list_last_entry(&cur->parent->children, - struct toptree, sibling)) - return NULL; - return (struct toptree *) list_next_entry(cur, sibling); -} - -/** - * toptree_next - Tree traversal function - * @cur: Pointer to current element - * @context: Pointer to the root node of the tree or subtree to - * be traversed. - * @level: The level of interest. - * - * RETURNS: - * Pointer to the next node on level @level - * or NULL when there is no next node. 
- */ -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level) -{ - struct toptree *cur_context, *tmp; - - if (!cur) - return NULL; - - if (context->level == level) - return NULL; - - tmp = toptree_next_sibling(cur); - if (tmp != NULL) - return tmp; - - cur_context = cur; - while (cur_context->level < context->level - 1) { - /* Step up */ - cur_context = cur_context->parent; - /* Step aside */ - tmp = toptree_next_sibling(cur_context); - if (tmp != NULL) { - /* Step down */ - tmp = toptree_first(tmp, level); - if (tmp != NULL) - return tmp; - } - } - return NULL; -} - -/** - * toptree_count - Count descendants on specified level - * @context: Pointer to node whose descendants are to be considered - * @level: Only descendants on the specified level will be counted - * - * RETURNS: - * Number of descendants on the specified level - */ -int toptree_count(struct toptree *context, int level) -{ - struct toptree *cur; - int cnt = 0; - - toptree_for_each(cur, context, level) - cnt++; - return cnt; -} diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h deleted file mode 100644 index 5246371ec713..000000000000 --- a/arch/s390/numa/toptree.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 
2015 - */ -#ifndef S390_TOPTREE_H -#define S390_TOPTREE_H - -#include -#include - -struct toptree { - int level; - int id; - cpumask_t mask; - struct toptree *parent; - struct list_head sibling; - struct list_head children; -}; - -struct toptree *toptree_alloc(int level, int id); -void toptree_free(struct toptree *cand); -void toptree_update_mask(struct toptree *cand); -void toptree_unify(struct toptree *cand); -struct toptree *toptree_get_child(struct toptree *cand, int id); -void toptree_move(struct toptree *cand, struct toptree *target); -int toptree_count(struct toptree *context, int level); - -struct toptree *toptree_first(struct toptree *context, int level); -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level); - -#define toptree_for_each_child(child, ptree) \ - list_for_each_entry(child, &ptree->children, sibling) - -#define toptree_for_each_child_safe(child, ptmp, ptree) \ - list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) - -#define toptree_is_last(ptree) \ - ((ptree->parent == NULL) || \ - (ptree->parent->children.prev == &ptree->sibling)) - -#define toptree_for_each(ptree, cont, ttype) \ - for (ptree = toptree_first(cont, ttype); \ - ptree != NULL; \ - ptree = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ - for (ptree = toptree_first(cont, ttype), \ - tmp = toptree_next(ptree, cont, ttype); \ - ptree != NULL; \ - ptree = tmp, \ - tmp = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_sibling(ptree, start) \ - toptree_for_each(ptree, start->parent, start->level) - -#endif /* S390_TOPTREE_H */ -- cgit v1.2.3 From 12437759602315d1eaa5ece8169cd1eedd018ff2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 27 Feb 2020 14:42:29 +0100 Subject: s390/mm: mark private defines for vm_fault_t as such This fixes several sparse warnings for fault.c: arch/s390/mm/fault.c:336:36: warning: restricted vm_fault_t degrades to integer 
arch/s390/mm/fault.c:573:23: warning: incorrect type in assignment (different base types) arch/s390/mm/fault.c:573:23: expected restricted vm_fault_t [usertype] fault arch/s390/mm/fault.c:573:23: got int Signed-off-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/fault.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7b0bb475c166..151adef0d5dd 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -44,11 +44,11 @@ #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 -#define VM_FAULT_PFAULT 0x100000 +#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) +#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) +#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) +#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) enum fault_type { KERNEL_FAULT, -- cgit v1.2.3 From 014b020475d4b9670d3cff11b751a7c20208f78b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 25 Feb 2020 12:44:06 +0100 Subject: s390/mm: cleanup phys_to_pfn() and friends Make page, frame, virtual and physical address conversion macros more expressive by avoiding redundant definitions and defining new macros using existing ones. 
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/page.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 85e944f04c70..2e53b27f4f1a 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -161,20 +161,20 @@ static inline int devmem_is_allowed(unsigned long pfn) #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)(unsigned long)(x)) -#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) + +#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn)) +#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr))) #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) -#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) - -#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) - -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -- cgit v1.2.3 From 035f212fa7f21035537cf6dea620fe5653191eb6 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 10 Feb 2020 17:53:25 +0100 Subject: s390/pci: embedding hotplug_slot in zdev Embedding the hotplug_slot in the zdev structure makes it possible to greatly simplify the hotplug handling by eliminating the handling of the slot_list.
Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index b05187ce5dbd..73b69a777152 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +97,7 @@ struct s390_domain; struct zpci_dev { struct pci_bus *bus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct hotplug_slot hotplug_slot; enum zpci_state state; u32 fid; /* function ID, used by sclp */ -- cgit v1.2.3 From d68d5d51dc898895b4e15bea52e5668ca9e76180 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 5 Mar 2020 07:44:09 +0100 Subject: s390/cpum_cf: Add new extended counters for IBM z15 Add CPU measurement counter facility event description for IBM z15. Signed-off-by: Thomas Richter Reviewed-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_cf_events.c | 123 ++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 8b33e03e47b8..1e3df52b2b65 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -238,6 +238,64 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 
0x0086); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z15, 
L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCERROR, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), @@ -516,6 +574,67 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES), 
+ CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z15, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, 
BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z15, DFLT_CC), + CPUMF_EVENT_PTR(cf_z15, DFLT_CCERROR), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -624,9 +743,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) break; case 0x3906: case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; case 0x8561: case 0x8562: - model = cpumcf_z14_pmu_event_attr; + model = cpumcf_z15_pmu_event_attr; break; default: model = none; -- cgit v1.2.3 From d2abfbe4652d2b49d30fe77548cf663e63d2d469 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 5 Mar 2020 15:01:21 +0100 Subject: s390: enable bpf jit by default when not built as always-on This is the s390 variant of commit 81c22041d9f1 ("bpf, x86, arm64: Enable jit by default when not built as always-on"). 
Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6b1f715dd8bb..f4ff75ff62f2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -109,6 +109,7 @@ config S390 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 -- cgit v1.2.3 From 0b38b5e1d0e2f361e418e05c179db05bb688bbd6 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 22 Jan 2020 13:38:22 +0100 Subject: s390: prevent leaking kernel address in BEAR When userspace executes a syscall or gets interrupted, BEAR contains a kernel address when returning to userspace. This makes it pretty easy to figure out where the kernel is mapped even with KASLR enabled. To fix this, add lpswe to lowcore and always execute it there, so userspace sees only the lowcore address of lpswe. For this we have to extend both critical_cleanup and the SWITCH_ASYNC macro to also check for lpswe addresses in lowcore.
Fixes: b2d24b97b2a9 ("s390/kernel: add support for kernel address space layout randomization (KASLR)") Cc: # v5.2+ Reviewed-by: Gerald Schaefer Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/lowcore.h | 4 ++- arch/s390/include/asm/processor.h | 1 + arch/s390/include/asm/setup.h | 7 +++++ arch/s390/kernel/asm-offsets.c | 2 ++ arch/s390/kernel/entry.S | 65 +++++++++++++++++++++++---------------- arch/s390/kernel/process.c | 1 + arch/s390/kernel/setup.c | 3 ++ arch/s390/kernel/smp.c | 2 ++ arch/s390/mm/vmem.c | 4 +++ 9 files changed, 62 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 237ee0c4169f..612ed3c6d581 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -141,7 +141,9 @@ struct lowcore { /* br %r1 trampoline */ __u16 br_r1_trampoline; /* 0x0400 */ - __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */ + __u32 return_lpswe; /* 0x0402 */ + __u32 return_mcck_lpswe; /* 0x0406 */ + __u8 pad_0x040a[0x0e00-0x040a]; /* 0x040a */ /* * 0xe00 contains the address of the IPL Parameter Information diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 361ef5eda468..c9522346799f 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -162,6 +162,7 @@ typedef struct thread_struct thread_struct; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ .fpu.regs = (void *) init_task.thread.fpu.fprs, \ + .last_break = 1, \ } /* diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index b241ddb67caf..534f212753d6 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -8,6 +8,7 @@ #include #include +#include #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" @@ -162,6 +163,12 @@ static inline unsigned long kaslr_offset(void) return __kaslr_offset; } +static inline u32 gen_lpswe(unsigned 
long addr) +{ + BUILD_BUG_ON(addr > 0xfff); + return 0xb2b20000 | addr; +} + #else /* __ASSEMBLY__ */ #define IPL_DEVICE (IPL_DEVICE_OFFSET) diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ce33406cfe83..e80f0e6f5972 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -124,6 +124,8 @@ int main(void) OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); + OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 9205add8481d..3ae64914bd14 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -115,26 +115,29 @@ _LPP_OFFSET = __LC_LPP .macro SWITCH_ASYNC savearea,timer tmhh %r8,0x0001 # interrupting from user ? - jnz 1f + jnz 2f lgr %r14,%r9 + cghi %r14,__LC_RETURN_LPSWE + je 0f slg %r14,BASED(.Lcritical_start) clg %r14,BASED(.Lcritical_length) - jhe 0f + jhe 1f +0: lghi %r11,\savearea # inside critical section, do cleanup brasl %r14,cleanup_critical tmhh %r8,0x0001 # retest problem state after cleanup - jnz 1f -0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? + jnz 2f +1: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? 
slgr %r14,%r15 srag %r14,%r14,STACK_SHIFT - jnz 2f + jnz 3f CHECK_STACK \savearea aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 3f -1: UPDATE_VTIME %r14,%r15,\timer + j 4f +2: UPDATE_VTIME %r14,%r15,\timer BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -2: lg %r15,__LC_ASYNC_STACK # load async stack -3: la %r11,STACK_FRAME_OVERHEAD(%r15) +3: lg %r15,__LC_ASYNC_STACK # load async stack +4: la %r11,STACK_FRAME_OVERHEAD(%r15) .endm .macro UPDATE_VTIME w1,w2,enter_timer @@ -401,7 +404,7 @@ ENTRY(system_call) stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW + b __LC_RETURN_LPSWE(%r0) .Lsysc_done: # @@ -608,43 +611,50 @@ ENTRY(pgm_check_handler) BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_CURRENT + srag %r11,%r10,12 + jnz 0f + /* if __LC_LAST_BREAK is < 4096, it contains one of + * the lpswe addresses in lowcore. Set it to 1 (initial state) + * to prevent leaking that address to userspace. + */ + lghi %r10,1 +0: lg %r12,__LC_CURRENT lghi %r11,0 larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit - jnz 2f # -> fault in user space + jnz 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a lgr %r14,%r9 slg %r14,BASED(.Lsie_critical_start) clg %r14,BASED(.Lsie_critical_length) - jhe 0f + jhe 1f lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit lghi %r11,_PIF_GUEST_FAULT #endif -0: tmhh %r8,0x4000 # PER bit set in old PSW ? - jnz 1f # -> enabled, can't be a double fault +1: tmhh %r8,0x4000 # PER bit set in old PSW ? 
+ jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -1: CHECK_STACK __LC_SAVE_AREA_SYNC +2: CHECK_STACK __LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER + # CHECK_VMAP_STACK branches to stack_overflow or 5f + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,5f +3: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP lg %r15,__LC_KERNEL_STACK lgr %r14,%r12 aghi %r14,__TASK_thread # pointer to thread_struct lghi %r13,__LC_PGM_TDB tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 3f + jz 4f mvc __THREAD_trap_tdb(256,%r14),0(%r13) -3: stg %r10,__THREAD_last_break(%r14) -4: lgr %r13,%r11 +4: stg %r10,__THREAD_last_break(%r14) +5: lgr %r13,%r11 la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use @@ -663,14 +673,14 @@ ENTRY(pgm_check_handler) stg %r13,__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 5f + jz 6f tmhh %r8,0x0001 # kernel per event ? 
jz .Lpgm_kprobe oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -5: REENABLE_IRQS +6: REENABLE_IRQS xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) @@ -775,7 +785,7 @@ ENTRY(io_int_handler) mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER .Lio_exit_kernel: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW + b __LC_RETURN_LPSWE(%r0) .Lio_done: # @@ -1214,7 +1224,7 @@ ENTRY(mcck_int_handler) stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER 0: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_MCCK_PSW + b __LC_RETURN_MCCK_LPSWE .Lmcck_panic: lg %r15,__LC_NODAT_STACK @@ -1271,6 +1281,8 @@ ENDPROC(stack_overflow) #endif ENTRY(cleanup_critical) + cghi %r9,__LC_RETURN_LPSWE + je .Lcleanup_lpswe #if IS_ENABLED(CONFIG_KVM) clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap jl 0f @@ -1424,6 +1436,7 @@ ENDPROC(cleanup_critical) mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) +.Lcleanup_lpswe: 1: lmg %r8,%r9,__LC_RETURN_PSW BR_EX %r14,%r11 .Lcleanup_sysc_restore_insn: diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6ccef5f29761..eb6e23ad15a2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -106,6 +106,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.system_timer = 0; p->thread.hardirq_timer = 0; p->thread.softirq_timer = 0; + p->thread.last_break = 1; frame->sf.back_chain = 0; /* new return point is ret_from_fork */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 1158a63a8e0e..26de59256466 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -73,6 +73,7 @@ #include #include #include +#include #include "entry.h" /* @@ -450,6 +451,8 @@ static void __init setup_lowcore_dat_off(void) lc->spinlock_index = 0; 
arch_spin_lock_setup(0); lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); set_prefix((u32)(unsigned long) lc); lowcore_ptr[0] = lc; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a08bd2522dd9..f87d4e14269c 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -212,6 +212,8 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); if (nmi_alloc_per_cpu(lc)) goto out_async; if (vdso_alloc_per_cpu(lc)) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b403fa14847d..f810930aff42 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -415,6 +415,10 @@ void __init vmem_map_init(void) SET_MEMORY_RO | SET_MEMORY_X); __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + + /* we need lowcore executable for our LPSWE instructions */ + set_memory_x(0, 1); + pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } -- cgit v1.2.3 From fb83510295d7a6cdeb46242515c3180f9adafc85 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 29 Jan 2020 12:15:15 +0100 Subject: s390/cpuinfo: add system topology information This update adjusts /proc/cpuinfo format to meet some user level programs expectations. It also makes the layout consistent with x86 where CPU topology is presented as blocks of key-value pairs. 
Reviewed-by: Vasily Gorbik Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/processor.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch') diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6ebc2117c66c..2c13ca562b48 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -151,6 +151,26 @@ static void show_cpu_summary(struct seq_file *m, void *v) } } +static void show_cpu_topology(struct seq_file *m, unsigned long n) +{ +#ifdef CONFIG_SCHED_TOPOLOGY + seq_printf(m, "physical id : %d\n", topology_physical_package_id(n)); + seq_printf(m, "core id : %d\n", topology_core_id(n)); + seq_printf(m, "book id : %d\n", topology_book_id(n)); + seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); + seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); +#endif /* CONFIG_SCHED_TOPOLOGY */ +} + +static void show_cpu_ids(struct seq_file *m, unsigned long n) +{ + struct cpuid *id = &per_cpu(cpu_info.cpu_id, n); + + seq_printf(m, "version : %02X\n", id->version); + seq_printf(m, "identification : %06X\n", id->ident); + seq_printf(m, "machine : %04X\n", id->machine); +} + static void show_cpu_mhz(struct seq_file *m, unsigned long n) { struct cpu_info *c = per_cpu_ptr(&cpu_info, n); @@ -171,6 +191,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (!machine_has_cpu_mhz) return 0; seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_topology(m, n); + show_cpu_ids(m, n); show_cpu_mhz(m, n); return 0; } -- cgit v1.2.3 From 8719b6d29d2851fa84c4074bb2e5adc022911ab8 Mon Sep 17 00:00:00 2001 From: afzal mohammed Date: Wed, 4 Mar 2020 06:20:48 +0530 Subject: s390/irq: replace setup_irq() by request_irq() request_irq() is preferred over setup_irq(). Invocations of setup_irq() occur after memory allocators are ready. Per tglx[1], setup_irq() existed in olden days when allocators were not ready by the time early interrupts were initialized. 
Hence replace setup_irq() by request_irq(). [1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos Signed-off-by: afzal mohammed Message-Id: <20200304005049.5291-1-afzal.mohd.ma@gmail.com> [heiko.carstens@de.ibm.com: replace pr_err with panic] Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 8371855042dc..da550cb8b31b 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -294,11 +294,6 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) return IRQ_HANDLED; } -static struct irqaction external_interrupt = { - .name = "EXT", - .handler = do_ext_interrupt, -}; - void __init init_ext_interrupts(void) { int idx; @@ -308,7 +303,8 @@ void __init init_ext_interrupts(void) irq_set_chip_and_handler(EXT_INTERRUPT, &dummy_irq_chip, handle_percpu_irq); - setup_irq(EXT_INTERRUPT, &external_interrupt); + if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL)) + panic("Failed to register EXT interrupt\n"); } static DEFINE_SPINLOCK(irq_subclass_lock); -- cgit v1.2.3 From 76fb118083eaf63f506fcbe695c1b12a38971b7a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 10 Mar 2020 10:25:51 +0100 Subject: s390/irq: make init_ext_interrupts static Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/hw_irq.h | 1 - arch/s390/kernel/irq.c | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h index adae176757ae..9078b5b6b837 100644 --- a/arch/s390/include/asm/hw_irq.h +++ b/arch/s390/include/asm/hw_irq.h @@ -7,6 +7,5 @@ void __init init_airq_interrupts(void); void __init init_cio_interrupts(void); -void __init init_ext_interrupts(void); #endif diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 
da550cb8b31b..3514420f0259 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -95,14 +95,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; -void __init init_IRQ(void) -{ - BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); - init_cio_interrupts(); - init_airq_interrupts(); - init_ext_interrupts(); -} - void do_IRQ(struct pt_regs *regs, int irq) { struct pt_regs *old_regs; @@ -294,7 +286,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) return IRQ_HANDLED; } -void __init init_ext_interrupts(void) +static void __init init_ext_interrupts(void) { int idx; @@ -307,6 +299,14 @@ void __init init_ext_interrupts(void) panic("Failed to register EXT interrupt\n"); } +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); +} + static DEFINE_SPINLOCK(irq_subclass_lock); static unsigned char irq_subclass_refcount[64]; -- cgit v1.2.3 From 1d49688d2bc6406d74566bca35b3d67201a906fc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 10 Mar 2020 10:29:43 +0100 Subject: s390/traps: mark test_monitor_call __init Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index dc75588d7894..ff9cc4c3290e 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -271,7 +271,7 @@ void kernel_stack_overflow(struct pt_regs *regs) } NOKPROBE_SYMBOL(kernel_stack_overflow); -static void test_monitor_call(void) +static void __init test_monitor_call(void) { int val = 1; -- cgit v1.2.3 From bb533ec8bacd064ee273ca3305db97938c3331ae Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 10 Mar 2020 09:01:44 +0100 Subject: s390/config: do not select VIRTIO_CONSOLE via Kconfig select 
does not ensure that dependencies are also selected. Instead of selecting VIRTIO_CONSOLE from S390_GUEST we should rather add this to the defconfigs. So we update those as well. Reported-by: Michael S. Tsirkin Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f4ff75ff62f2..334f3f2199e8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -953,7 +953,6 @@ config S390_GUEST select TTY select VIRTUALIZATION select VIRTIO - select VIRTIO_CONSOLE help Enabling this option adds support for virtio based paravirtual device drivers on s390. diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 2e60c80395ab..4ca5c7499cce 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -532,6 +532,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 CONFIG_NULL_TTY=m +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 25f799849582..c0950750fb50 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -528,6 +528,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 CONFIG_NULL_TTY=m +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m -- cgit v1.2.3 From 31932757c6121b394cd4f158b6b8f1cca8ffe871 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 8 Mar 2020 21:34:49 +0100 Subject: s390/mm: optimize page table upgrade routine There is a maximum of two new tables allocated on page table upgrade. 
Because we know that, the loop the current implementation is based on can be unrolled, with some improvements: * upgrade from 3 to 5 levels happens in one go - without an unnecessary re-take of page_table_lock in-between; * page table initialization is moved out of the atomic code; Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/pgalloc.c | 90 +++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 3dd253f81a77..d3be3fe2c55d 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -77,43 +77,65 @@ static void __crst_table_upgrade(void *arg) int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { - unsigned long *table, *pgd; - int rc, notify; + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - rc = 0; - notify = 0; - while (mm->context.asce_limit < end) { - table = crst_table_alloc(mm); - if (!table) { - rc = -ENOMEM; - break; - } - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == _REGION2_SIZE) { - crst_table_init(table, _REGION2_ENTRY_EMPTY); - p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = _REGION1_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm_inc_nr_puds(mm); - } else { - crst_table_init(table, _REGION1_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = -PAGE_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; - } - notify = 1; - 
spin_unlock_bh(&mm->page_table_lock); + VM_BUG_ON(asce_limit < _REGION2_SIZE); + + if (end <= asce_limit) + return 0; + + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); } - if (notify) - on_each_cpu(__crst_table_upgrade, mm, 0); - return rc; + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } + + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_sem lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. + */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = -PAGE_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + + spin_unlock_bh(&mm->page_table_lock); + + on_each_cpu(__crst_table_upgrade, mm, 0); + + return 0; + +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; } void crst_table_downgrade(struct mm_struct *mm) -- cgit v1.2.3 From 42d211a1ae3b77008d4190b7dc79ad29b48bbcd2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 11 Mar 2020 14:18:05 +0100 Subject: s390/cpuinfo: show processor physical address Show CPU physical address as reported by STAP instruction Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/smp.h | 1 + 
arch/s390/kernel/processor.c | 1 + arch/s390/kernel/smp.c | 5 +++++ 3 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index b157a81fb977..231a51e870fe 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -34,6 +34,7 @@ extern int smp_vcpu_scheduled(int cpu); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); +extern int smp_cpu_get_cpu_address(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 2c13ca562b48..b98654d0ce41 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -159,6 +159,7 @@ static void show_cpu_topology(struct seq_file *m, unsigned long n) seq_printf(m, "book id : %d\n", topology_book_id(n)); seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); + seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); #endif /* CONFIG_SCHED_TOPOLOGY */ } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index f87d4e14269c..edc1bf39c542 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -703,6 +703,11 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } +int smp_cpu_get_cpu_address(int cpu) +{ + return pcpu_devices[cpu].address; +} + static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; -- cgit v1.2.3 From cd8e702f0db75f28d0fbdc574a8fcda4aca0b09b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 12 Mar 2020 11:35:05 +0100 Subject: s390/numa: remove redundant cpus_with_topology variable Variable cpus_with_topology is a leftover that became unneeded once the fake NUMA support has been removed. 
Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/topology.h | 1 - arch/s390/kernel/topology.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index bd3417185e30..4648303e6958 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -24,7 +24,6 @@ struct cpu_topology_s390 { }; extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; -extern cpumask_t cpus_with_topology; #define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) #define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index c189f5d996ff..ec1bffe6ce75 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -62,8 +62,6 @@ static struct mask_info drawer_info; struct cpu_topology_s390 cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -cpumask_t cpus_with_topology; - static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { cpumask_t mask; @@ -137,7 +135,6 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); - cpumask_set_cpu(lcpu + i, &cpus_with_topology); smp_cpu_set_polarization(lcpu + i, tl_core->pp); } } @@ -262,8 +259,6 @@ static void update_cpu_masks(void) topo->socket_id = id; topo->book_id = id; topo->drawer_id = id; - if (cpu_present(cpu)) - cpumask_set_cpu(cpu, &cpus_with_topology); } } } @@ -287,7 +282,6 @@ static int __arch_update_cpu_topology(void) int rc = 0; mutex_lock(&smp_cpu_state_mutex); - cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); -- cgit v1.2.3 From 52aeda7accb6d2e511a1b89142cbbf6fd2c12565 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 12 Mar 2020 11:32:23 +0100 
Subject: s390/topology: remove offline CPUs from CPU topology masks The CPU topology masks on s390 also contain bits of CPUs which are offline. This is already a problem, since the common code scheduler expects e.g. cpu_smt_mask() to reflect reality. This update changes the described behaviour so that s390 behaves like all other architectures. Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/topology.h | 2 ++ arch/s390/kernel/smp.c | 6 ++++-- arch/s390/kernel/topology.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 4648303e6958..56b14616fb6b 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -43,6 +43,7 @@ int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); +void update_cpu_masks(void); void topology_expect_change(void); const struct cpumask *cpu_coregroup_mask(int cpu); @@ -52,6 +53,7 @@ static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline void update_cpu_masks(void) { } static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index edc1bf39c542..7eaabbab2213 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -858,12 +858,13 @@ static void smp_init_secondary(void) init_cpu_timer(); vtime_init(); pfault_init(); - notify_cpu_starting(smp_processor_id()); + notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); - 
set_cpu_online(smp_processor_id(), true); + set_cpu_online(cpu, true); + update_cpu_masks(); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); @@ -935,6 +936,7 @@ int __cpu_disable(void) /* Handle possible pending IPIs */ smp_handle_ext_call(); set_cpu_online(smp_processor_id(), false); + update_cpu_masks(); /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index ec1bffe6ce75..09711d55f123 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -88,6 +88,7 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) cpumask_copy(&mask, cpumask_of(cpu)); break; } + cpumask_and(&mask, &mask, cpu_online_mask); return mask; } @@ -103,6 +104,7 @@ static cpumask_t cpu_thread_map(unsigned int cpu) for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_present(cpu + i)) cpumask_set_cpu(cpu + i, &mask); + cpumask_and(&mask, &mask, cpu_online_mask); return mask; } @@ -241,7 +243,7 @@ int topology_set_cpu_management(int fc) return rc; } -static void update_cpu_masks(void) +void update_cpu_masks(void) { struct cpu_topology_s390 *topo; int cpu, id; -- cgit v1.2.3 From 1a2ae03b1938b050c3bbd79e79d5075e0307fe20 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 20 Feb 2020 16:22:30 +0100 Subject: s390/ipl: add support to control memory clearing for FCP and CCW re-IPL Re-IPL for both CCW and FCP is currently done by using diag 308 with the "Load Clear" subcode, which means that all memory will be cleared. This can increase re-IPL duration considerably on very large machines. For CCW devices, there is also a "Load Normal" subcode that was only used for dump kernels so far. For FCP devices, a similar "Load Normal" subcode was introduced with z14. The "Load Normal" diag 308 subcode allows to re-IPL without clearing memory. 
This patch adds a new "clear" sysfs attribute to /sys/firmware/reipl for both the ccw and fcp subdirectories, which can be set to either "0" or "1" to disable or enable re-IPL with memory clearing. The default value is "0", which disables memory clearing. Signed-off-by: Gerald Schaefer Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ipl.h | 1 + arch/s390/kernel/ipl.c | 73 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 084e71b7272a..b63bd66404b8 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -119,6 +119,7 @@ enum diag308_subcode { DIAG308_LOAD_NORMAL_DUMP = 4, DIAG308_SET = 5, DIAG308_STORE = 6, + DIAG308_LOAD_NORMAL = 7, }; enum diag308_rc { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6837affc19e8..4a71061974fd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -144,6 +144,9 @@ static struct ipl_parameter_block *dump_block_ccw; static struct sclp_ipl_info sclp_ipl_info; +static bool reipl_fcp_clear; +static bool reipl_ccw_clear; + static inline int __diag308(unsigned long subcode, void *addr) { register unsigned long _addr asm("0") = (unsigned long) addr; @@ -691,6 +694,21 @@ static struct kobj_attribute sys_reipl_fcp_loadparm_attr = __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, reipl_fcp_loadparm_store); +static ssize_t reipl_fcp_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_fcp_clear); +} + +static ssize_t reipl_fcp_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (strtobool(buf, &reipl_fcp_clear) < 0) + return -EINVAL; + return len; +} + static struct attribute *reipl_fcp_attrs[] = { &sys_reipl_fcp_device_attr.attr, &sys_reipl_fcp_wwpn_attr.attr, @@ -706,6 +724,9 @@ static 
struct attribute_group reipl_fcp_attr_group = { .bin_attrs = reipl_fcp_bin_attrs, }; +static struct kobj_attribute sys_reipl_fcp_clear_attr = + __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); + /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); @@ -741,16 +762,36 @@ static struct kobj_attribute sys_reipl_ccw_loadparm_attr = __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, reipl_ccw_loadparm_store); +static ssize_t reipl_ccw_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_ccw_clear); +} + +static ssize_t reipl_ccw_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (strtobool(buf, &reipl_ccw_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_ccw_clear_attr = + __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store); + static struct attribute *reipl_ccw_attrs_vm[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, &sys_reipl_ccw_vmparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; static struct attribute *reipl_ccw_attrs_lpar[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; @@ -892,11 +933,17 @@ static void __reipl_run(void *unused) switch (reipl_type) { case IPL_TYPE_CCW: diag308(DIAG308_SET, reipl_block_ccw); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_ccw_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_fcp_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); @@ -1008,11 +1055,16 @@ static int __init reipl_fcp_init(void) } rc = sysfs_create_group(&reipl_fcp_kset->kobj, 
&reipl_fcp_attr_group); - if (rc) { - kset_unregister(reipl_fcp_kset); - free_page((unsigned long) reipl_block_fcp); - return rc; - } + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_clear_attr.attr); + if (rc) + goto out2; + } else + reipl_fcp_clear = true; if (ipl_info.type == IPL_TYPE_FCP) { memcpy(reipl_block_fcp, &ipl_block, sizeof(ipl_block)); @@ -1032,6 +1084,13 @@ static int __init reipl_fcp_init(void) } reipl_capabilities |= IPL_TYPE_FCP; return 0; + +out2: + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); +out1: + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; } static int __init reipl_type_init(void) -- cgit v1.2.3 From 959684978d5a8443cfb0ed59a9d1fc59d2a80d09 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 13 Mar 2020 16:52:44 +0100 Subject: s390/cpuinfo: show number of online cores Show number of cores that run at least one SMT thread Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/topology.h | 3 +++ arch/s390/kernel/processor.c | 1 + arch/s390/kernel/topology.c | 20 ++++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 56b14616fb6b..fbb507504a3b 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -17,6 +17,7 @@ struct cpu_topology_s390 { unsigned short book_id; unsigned short drawer_id; unsigned short dedicated : 1; + int booted_cores; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -35,6 +36,7 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; #define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) #define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define topology_cpu_dedicated(cpu) 
(cpu_topology[cpu].dedicated) +#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores) #define mc_capable() 1 @@ -53,6 +55,7 @@ static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline int topology_booted_cores(int cpu_nr) { return 1; } static inline void update_cpu_masks(void) { } static inline void topology_expect_change(void) { } diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index b98654d0ce41..41172093160e 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -160,6 +160,7 @@ static void show_cpu_topology(struct seq_file *m, unsigned long n) seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); #endif /* CONFIG_SCHED_TOPOLOGY */ } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 09711d55f123..d03edebce754 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -245,8 +245,8 @@ int topology_set_cpu_management(int fc) void update_cpu_masks(void) { - struct cpu_topology_s390 *topo; - int cpu, id; + struct cpu_topology_s390 *topo, *topo_package, *topo_sibling; + int cpu, sibling, pkg_first, smt_first, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; @@ -254,6 +254,7 @@ void update_cpu_masks(void) topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + topo->booted_cores = 0; if (topology_mode != TOPOLOGY_MODE_HW) { id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 
0 : cpu; topo->thread_id = cpu; @@ -263,6 +264,21 @@ void update_cpu_masks(void) topo->drawer_id = id; } } + for_each_online_cpu(cpu) { + topo = &cpu_topology[cpu]; + pkg_first = cpumask_first(&topo->core_mask); + topo_package = &cpu_topology[pkg_first]; + if (cpu == pkg_first) { + for_each_cpu(sibling, &topo->core_mask) { + topo_sibling = &cpu_topology[sibling]; + smt_first = cpumask_first(&topo_sibling->thread_mask); + if (sibling == smt_first) + topo_package->booted_cores++; + } + } else { + topo->booted_cores = topo_package->booted_cores; + } + } } void store_topology(struct sysinfo_15_1_x *info) -- cgit v1.2.3 From 2db52dc353146e684d0b3ec4060f00ebefc5c4d2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 13 Mar 2020 16:55:10 +0100 Subject: s390/cpuinfo: show number of online CPUs within a package Show number of online CPUs within a package (which is the socket in the case of s390). For what it's worth, present that value as the "siblings" field - just like x86 does. Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/processor.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 41172093160e..39bf777e140e 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -160,6 +160,7 @@ static void show_cpu_topology(struct seq_file *m, unsigned long n) seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n))); seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); #endif /* CONFIG_SCHED_TOPOLOGY */ } -- cgit v1.2.3 From 872f27103874a73783aeff2aac2b41a489f67d7c Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 16 Mar 2020 12:39:55 +0100 Subject: s390/cpuinfo: fix
wrong output when CPU0 is offline /proc/cpuinfo should not print information about CPU 0 when it is offline. Fixes: 281eaa8cb67c ("s390/cpuinfo: simplify locking and skip offline cpus early") Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens [heiko.carstens@de.ibm.com: shortened commit message] Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/processor.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 39bf777e140e..f36acc8d2631 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -188,8 +188,9 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n) static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); - if (!n) + if (n == first) show_cpu_summary(m, v); if (!machine_has_cpu_mhz) return 0; @@ -204,6 +205,8 @@ static inline void *c_update(loff_t *pos) { if (*pos) *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL; } -- cgit v1.2.3 From 1b648dfd544bd9d486926e221743b9bd143d7eca Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 16 Mar 2020 10:25:44 +0100 Subject: s390/cpuinfo: do not skip info for CPUs without MHz feature In the past there was no per-CPU information in /proc/cpuinfo other than CPU frequency. Hence, for machines without the CPU MHz feature there was nothing to show. Now CPU topology and IDs can still be shown, so do not omit this information from the output.
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens [heiko.carstens@de.ibm.com: moved comparison] Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/processor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index f36acc8d2631..c92d04f876cb 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -178,6 +178,8 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n) { struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + if (!machine_has_cpu_mhz) + return; seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); } @@ -192,8 +194,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (n == first) show_cpu_summary(m, v); - if (!machine_has_cpu_mhz) - return 0; seq_printf(m, "\ncpu number : %ld\n", n); show_cpu_topology(m, n); show_cpu_ids(m, n); -- cgit v1.2.3 From 394216275c7d503d966317da9a01ad6626a6091d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Mar 2020 20:55:24 +0100 Subject: s390: remove broken hibernate / power management support Hibernation is known to be broken for many years on s390. Given that there aren't any real use cases, remove the code instead of spending time to fix and maintain it. Without hibernate support it doesn't make too much sense to keep power management support; therefore remove it completely. 
Acked-by: Christian Borntraeger Acked-by: Peter Oberparleiter Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 10 -- arch/s390/kernel/Makefile | 1 - arch/s390/kernel/machine_kexec.c | 31 ----- arch/s390/kernel/suspend.c | 240 ---------------------------------- arch/s390/kernel/swsusp.S | 276 --------------------------------------- arch/s390/mm/cmm.c | 46 +------ arch/s390/mm/pageattr.c | 16 --- arch/s390/pci/pci.c | 43 ------ 8 files changed, 3 insertions(+), 660 deletions(-) delete mode 100644 arch/s390/kernel/suspend.c delete mode 100644 arch/s390/kernel/swsusp.S (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 334f3f2199e8..0e2ad7b2e073 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,7 +102,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_KEEP_MEMBLOCK - select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING @@ -810,15 +809,6 @@ config SECCOMP If unsure, say Y. 
-menu "Power Management" - -config ARCH_HIBERNATION_POSSIBLE - def_bool y - -source "kernel/power/Kconfig" - -endmenu - config CCW def_bool y diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2b1203cf7be6..10f80071f945 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -54,7 +54,6 @@ CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o -obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb8b1cc285c9..3a854cb5a4c6 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -38,36 +37,6 @@ extern const unsigned long long relocate_kernel_len; #ifdef CONFIG_CRASH_DUMP -/* - * PM notifier callback for kdump - */ -static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - if (kexec_crash_image) - arch_kexec_protect_crashkres(); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init machine_kdump_pm_init(void) -{ - pm_notifier(machine_kdump_pm_cb, 0); - return 0; -} -arch_initcall(machine_kdump_pm_init); - /* * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c deleted file mode 100644 index 75b7b307946e..000000000000 --- a/arch/s390/kernel/suspend.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Suspend support specific for 
s390. - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "entry.h" - -/* - * The restore of the saved pages in an hibernation image will set - * the change and referenced bits in the storage key for each page. - * Overindication of the referenced bits after an hibernation cycle - * does not cause any harm but the overindication of the change bits - * would cause trouble. - * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each - * page to the most significant byte of the associated page frame - * number in the hibernation image. - */ - -/* - * Key storage is allocated as a linked list of pages. - * The size of the keys array is (PAGE_SIZE - sizeof(long)) - */ -struct page_key_data { - struct page_key_data *next; - unsigned char data[]; -}; - -#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *)) - -static struct page_key_data *page_key_data; -static struct page_key_data *page_key_rp, *page_key_wp; -static unsigned long page_key_rx, page_key_wx; -unsigned long suspend_zero_pages; - -/* - * For each page in the hibernation image one additional byte is - * stored in the most significant byte of the page frame number. - * On suspend no additional memory is required but on resume the - * keys need to be memorized until the page data has been restored. - * Only then can the storage keys be set to their old state. - */ -unsigned long page_key_additional_pages(unsigned long pages) -{ - return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); -} - -/* - * Free page_key_data list of arrays. - */ -void page_key_free(void) -{ - struct page_key_data *pkd; - - while (page_key_data) { - pkd = page_key_data; - page_key_data = pkd->next; - free_page((unsigned long) pkd); - } -} - -/* - * Allocate page_key_data list of arrays with enough room to store - * one byte for each page in the hibernation image. 
- */ -int page_key_alloc(unsigned long pages) -{ - struct page_key_data *pk; - unsigned long size; - - size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); - while (size--) { - pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL); - if (!pk) { - page_key_free(); - return -ENOMEM; - } - pk->next = page_key_data; - page_key_data = pk; - } - page_key_rp = page_key_wp = page_key_data; - page_key_rx = page_key_wx = 0; - return 0; -} - -/* - * Save the storage key into the upper 8 bits of the page frame number. - */ -void page_key_read(unsigned long *pfn) -{ - struct page *page; - unsigned long addr; - unsigned char key; - - page = pfn_to_page(*pfn); - addr = (unsigned long) page_address(page); - key = (unsigned char) page_get_storage_key(addr) & 0x7f; - if (arch_test_page_nodat(page)) - key |= 0x80; - *(unsigned char *) pfn = key; -} - -/* - * Extract the storage key from the upper 8 bits of the page frame number - * and store it in the page_key_data list of arrays. - */ -void page_key_memorize(unsigned long *pfn) -{ - page_key_wp->data[page_key_wx] = *(unsigned char *) pfn; - *(unsigned char *) pfn = 0; - if (++page_key_wx < PAGE_KEY_DATA_SIZE) - return; - page_key_wp = page_key_wp->next; - page_key_wx = 0; -} - -/* - * Get the next key from the page_key_data list of arrays and set the - * storage key of the page referred by @address. If @address refers to - * a "safe" page the swsusp_arch_resume code will transfer the storage - * key from the buffer page to the original page. 
- */ -void page_key_write(void *address) -{ - struct page *page; - unsigned char key; - - key = page_key_rp->data[page_key_rx]; - page_set_storage_key((unsigned long) address, key & 0x7f, 0); - page = virt_to_page(address); - if (key & 0x80) - arch_set_page_nodat(page, 0); - else - arch_set_page_dat(page, 0); - if (++page_key_rx >= PAGE_KEY_DATA_SIZE) - return; - page_key_rp = page_key_rp->next; - page_key_rx = 0; -} - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); - unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); - unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1; - unsigned long stext_pfn = PFN_DOWN(__pa(_stext)); - - /* Always save lowcore pages (LC protection might be enabled). */ - if (pfn <= LC_PAGES) - return 0; - if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) - return 1; - /* Skip memory holes and read-only pages (DCSS, ...). */ - if (pfn >= stext_pfn && pfn <= end_rodata_pfn) - return 0; - if (tprot(PFN_PHYS(pfn))) - return 1; - return 0; -} - -/* - * PM notifier callback for suspend - */ -static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); - if (!suspend_zero_pages) - return NOTIFY_BAD; - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - free_pages(suspend_zero_pages, LC_ORDER); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init suspend_pm_init(void) -{ - pm_notifier(suspend_pm_cb, 0); - return 0; -} -arch_initcall(suspend_pm_init); - -void save_processor_state(void) -{ - /* swsusp_arch_suspend() actually saves all cpu register contents. - * Machine checks must be disabled since swsusp_arch_suspend() stores - * register contents to their lowcore save areas. 
That's the same - * place where register contents on machine checks would be saved. - * To avoid register corruption disable machine checks. - * We must also disable machine checks in the new psw mask for - * program checks, since swsusp_arch_suspend() may generate program - * checks. Disabling machine checks for all other new psw masks is - * just paranoia. - */ - local_mcck_disable(); - /* Disable lowcore protection */ - __ctl_clear_bit(0,28); - S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK; -} - -void restore_processor_state(void) -{ - S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK; - /* Enable lowcore protection */ - __ctl_set_bit(0,28); - local_mcck_enable(); -} - -/* Called at the end of swsusp_arch_resume */ -void s390_early_resume(void) -{ - lgr_info_log(); - channel_subsystem_reinit(); - zpci_rescan(); -} diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S deleted file mode 100644 index a7baf0b5f818..000000000000 --- a/arch/s390/kernel/swsusp.S +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 64-bit swsusp implementation - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht - * Michael Holzheu - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Save register context in absolute 0 lowcore and call swsusp_save() to - * create in-memory kernel image. The context is saved in the designated - * "store status" memory locations (see POP). - * We return from this function twice. The first time during the suspend to - * disk process. The second time via the swsusp_arch_resume() function - * (see below) in the resume process. 
- * This function runs with disabled interrupts. - */ - GEN_BR_THUNK %r14 - - .section .text -ENTRY(swsusp_arch_suspend) - lg %r1,__LC_NODAT_STACK - stmg %r6,%r15,__SF_GPRS(%r1) - aghi %r1,-STACK_FRAME_OVERHEAD - stg %r15,__SF_BACKCHAIN(%r1) - lgr %r15,%r1 - - /* Store FPU registers */ - brasl %r14,save_fpu_regs - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Store prefix register on stack */ - stpx __SF_EMPTY(%r15) - - /* Save prefix register contents for lowcore copy */ - llgf %r10,__SF_EMPTY(%r15) - - /* Get pointer to save area */ - lghi %r1,0x1000 - - /* Save CPU address */ - stap __LC_EXT_CPU_ADDR(%r0) - - /* Store registers */ - mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */ - stam %a0,%a15,0x340(%r1) /* store access registers */ - stctg %c0,%c15,0x380(%r1) /* store control registers */ - stmg %r0,%r15,0x280(%r1) /* store general registers */ - - stpt 0x328(%r1) /* store timer */ - stck __SF_EMPTY(%r15) /* store clock */ - stckc 0x330(%r1) /* store clock comparator */ - - /* Update cputime accounting before going to sleep */ - lg %r0,__LC_LAST_UPDATE_TIMER - slg %r0,0x328(%r1) - alg %r0,__LC_SYSTEM_TIMER - stg %r0,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1) - lg %r0,__LC_LAST_UPDATE_CLOCK - slg %r0,__SF_EMPTY(%r15) - alg %r0,__LC_STEAL_TIMER - stg %r0,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Save absolute zero pages */ - larl %r2,suspend_zero_pages - lg %r2,0(%r2) - lghi %r4,0 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Copy lowcore to absolute zero lowcore */ - lghi %r2,0 - lgr %r4,%r10 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Save image */ - brasl %r14,swsusp_save - - /* Restore prefix register and return */ - lghi %r1,0x1000 - spx 0x318(%r1) - lmg 
%r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_suspend) - -/* - * Restore saved memory image to correct place and restore register context. - * Then we return to the function that called swsusp_arch_suspend(). - * swsusp_arch_resume() runs with disabled interrupts. - */ -ENTRY(swsusp_arch_resume) - stmg %r6,%r15,__SF_GPRS(%r15) - lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD - stg %r1,__SF_BACKCHAIN(%r15) - - /* Make all free pages stable */ - lghi %r2,1 - brasl %r14,arch_set_page_states - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Restore saved image */ - larl %r1,restore_pblist - lg %r1,0(%r1) - ltgr %r1,%r1 - jz 2f -0: - lg %r2,8(%r1) - lg %r4,0(%r1) - iske %r0,%r4 - lghi %r3,PAGE_SIZE - lghi %r5,PAGE_SIZE -1: - mvcle %r2,%r4,0 - jo 1b - lg %r2,8(%r1) - sske %r0,%r2 - lg %r1,16(%r1) - ltgr %r1,%r1 - jnz 0b -2: - ptlb /* flush tlb */ - - /* Reset System */ - larl %r1,.Lnew_pgm_check_psw - epsw %r2,%r3 - stm %r2,%r3,0(%r1) - mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - larl %r1,__swsusp_reset_dma - lg %r1,0(%r1) - BASR_EX %r14,%r1 - larl %r1,smp_cpu_mt_shift - icm %r1,15,0(%r1) - jz smt_done - llgfr %r1,%r1 -smt_loop: - sigp %r1,%r0,SIGP_SET_MULTI_THREADING - brc 8,smt_done /* accepted */ - brc 2,smt_loop /* busy, try again */ -smt_done: - larl %r1,.Lnew_pgm_check_psw - lpswe 0(%r1) -pgm_check_entry: - - /* Switch to original suspend CPU */ - larl %r1,.Lresume_cpu /* Resume CPU address: r2 */ - stap 0(%r1) - llgh %r2,0(%r1) - llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */ - cgr %r1,%r2 - je restore_registers /* r1 = r2 -> nothing to do */ - larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */ - mvc __LC_RST_NEW_PSW(16,%r0),0(%r4) -3: - sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */ - brc 8,4f /* accepted */ - brc 2,3b /* busy, try again */ - - /* Suspend CPU not available 
-> panic */ - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD - larl %r2,.Lpanic_string - brasl %r14,sclp_early_printk_force - larl %r3,.Ldisabled_wait_31 - lpsw 0(%r3) -4: - /* Switch to suspend CPU */ - sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */ - brc 2,4b /* busy, try again */ -5: - sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */ - brc 2,5b /* busy, try again */ -6: j 6b - -restart_suspend: - larl %r1,.Lresume_cpu - llgh %r2,0(%r1) -7: - sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */ - brc 8,7b /* accepted, status 0, still running */ - brc 2,7b /* busy, try again */ - tmll %r9,0x40 /* Test if resume CPU is stopped */ - jz 7b - -restore_registers: - /* Restore registers */ - lghi %r13,0x1000 /* %r1 = pointer to save area */ - - /* Ignore time spent in suspended state. */ - llgf %r1,0x318(%r13) - stck __LC_LAST_UPDATE_CLOCK(%r1) - spt 0x328(%r13) /* reprogram timer */ - //sckc 0x330(%r13) /* set clock comparator */ - - lctlg %c0,%c15,0x380(%r13) /* load control registers */ - lam %a0,%a15,0x340(%r13) /* load access registers */ - - /* Load old stack */ - lg %r15,0x2f8(%r13) - - /* Save prefix register */ - mvc __SF_EMPTY(4,%r15),0x318(%r13) - - /* Restore absolute zero pages */ - lghi %r2,0 - larl %r4,suspend_zero_pages - lg %r4,0(%r4) - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Restore prefix register */ - spx __SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Make all free pages unstable */ - lghi %r2,0 - brasl %r14,arch_set_page_states - - /* Call arch specific early resume code */ - brasl %r14,s390_early_resume - - /* Return 0 */ - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_resume) - - .section .data..nosave,"aw",@progbits - .align 8 -.Ldisabled_wait_31: - .long 0x000a0000,0x00000000 -.Lpanic_string: - .asciz "Resume not possible because suspend CPU is no longer available\n" - .align 8 
-.Lrestart_suspend_psw: - .quad 0x0000000180000000,restart_suspend -.Lnew_pgm_check_psw: - .quad 0,pgm_check_entry -.Lresume_cpu: - .byte 0,0 diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a51c892f14f3..ae989b740376 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -151,9 +149,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -390,38 +388,6 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; @@ -446,16 +412,11 @@ static int __init 
cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -475,7 +436,6 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); del_timer_sync(&cmm_timer); diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f8c6faab41f4..e22c06d5f206 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -367,20 +367,4 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } } -#ifdef CONFIG_HIBERNATION -bool kernel_page_present(struct page *page) -{ - unsigned long addr; - int cc; - - addr = page_to_phys(page); - asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; -} -#endif /* CONFIG_HIBERNATION */ - #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index bc61ea18e88d..d4eaba24e300 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -607,49 +607,6 @@ void pcibios_disable_device(struct pci_dev *pdev) zpci_debug_exit_device(zdev); } -#ifdef CONFIG_HIBERNATE_CALLBACKS -static int zpci_restore(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - int ret = 0; - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - goto out; - - ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); - if (ret) - goto out; - - zpci_map_resources(pdev); - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - -out: - return 
ret; -} - -static int zpci_freeze(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - return 0; - - zpci_unregister_ioat(zdev, 0); - zpci_unmap_resources(pdev); - return clp_disable_fh(zdev); -} - -struct dev_pm_ops pcibios_pm_ops = { - .thaw_noirq = zpci_restore, - .freeze_noirq = zpci_freeze, - .restore_noirq = zpci_restore, - .poweroff_noirq = zpci_freeze, -}; -#endif /* CONFIG_HIBERNATE_CALLBACKS */ - static int zpci_alloc_domain(struct zpci_dev *zdev) { if (zpci_unique_uid) { -- cgit v1.2.3 From 969ae01bab2fe938b4c8324836038b5ac1c78fac Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 17 Mar 2020 12:59:37 +0100 Subject: s390/pci: Fix zpci_alloc_domain() over allocation Until now zpci_alloc_domain() only prevented more than CONFIG_PCI_NR_FUNCTIONS from being added when using automatic domain allocation. When explicit UIDs were defined, UIDs above CONFIG_PCI_NR_FUNCTIONS were not counted at all. When more PCI functions are added, this could lead to various errors including undersized IRQ vectors and similar issues. Fix this by explicitly tracking the number of allocated domains.
Signed-off-by: Niklas Schnelle Reviewed-by: Pierre Morel Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 34 ++++++++++++++++++---------------- 2 files changed, 19 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 73b69a777152..93527655e752 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -26,6 +26,7 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_NR_DMA_SPACES 1 #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS +#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index d4eaba24e300..2b90a90aa81d 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -40,8 +40,9 @@ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); +static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); +static unsigned int zpci_num_domains_allocated; #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ @@ -609,12 +610,16 @@ void pcibios_disable_device(struct pci_dev *pdev) static int zpci_alloc_domain(struct zpci_dev *zdev) { + spin_lock(&zpci_domain_lock); + if (zpci_num_domains_allocated > (ZPCI_NR_DEVICES - 1)) { + spin_unlock(&zpci_domain_lock); + pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + if (zpci_unique_uid) { zdev->domain = (u16) zdev->uid; - if (zdev->domain >= ZPCI_NR_DEVICES) - return 0; - - spin_lock(&zpci_domain_lock); if (test_bit(zdev->domain, zpci_domain)) { spin_unlock(&zpci_domain_lock); pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", @@ -622,30 +627,27 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) return -EEXIST; } 
set_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated++; spin_unlock(&zpci_domain_lock); return 0; } - - spin_lock(&zpci_domain_lock); + /* + * We can always auto allocate domains below ZPCI_NR_DEVICES. + * There is either a free domain or we have reached the maximum in + * which case we would have bailed earlier. + */ zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); - if (zdev->domain == ZPCI_NR_DEVICES) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", - zdev->fid, ZPCI_NR_DEVICES); - return -ENOSPC; - } set_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated++; spin_unlock(&zpci_domain_lock); return 0; } static void zpci_free_domain(struct zpci_dev *zdev) { - if (zdev->domain >= ZPCI_NR_DEVICES) - return; - spin_lock(&zpci_domain_lock); clear_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated--; spin_unlock(&zpci_domain_lock); } -- cgit v1.2.3 From 7a11c67a1ff9b0231eaaaa6a28294776d55b569a Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 18 Mar 2020 13:53:16 +0100 Subject: s390/pci: Improve handling of unset UID When UID checking is enabled a UID value of 0 is invalid and can not be set by the user. On z/VM it is however used to indicate an unset UID. Until now, this led to the behavior that one PCI function could be attached with UID 0 after which z/VM would prohibit further attachment. Now if the user then turns off UID checking in z/VM the user could seemingly attach additional PCI functions that would however not show up in Linux as that would not be informed of the change in UID checking mode. This is unexpected and confusing and led to bug reports against Linux. Instead now, if we encounter an unset UID value of 0 treat it as indicating that UID checking was turned off, switch to automatic domain allocation, and warn the user of the possible misconfiguration.
Signed-off-by: Niklas Schnelle Reviewed-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 3 +++ arch/s390/pci/pci.c | 8 ++++++++ arch/s390/pci/pci_clp.c | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 93527655e752..7485ee561fec 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -189,6 +189,9 @@ int clp_enable_fh(struct zpci_dev *, u8); int clp_disable_fh(struct zpci_dev *); int clp_get_state(u32 fid, enum zpci_state *state); +/* UID */ +void update_uid_checking(bool new); + /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 2b90a90aa81d..cf7485bdd7cf 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -620,6 +620,13 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) if (zpci_unique_uid) { zdev->domain = (u16) zdev->uid; + if (zdev->domain == 0) { + pr_warn("UID checking is active but no UID is set for PCI function %08x, so automatic domain allocation is used instead\n", + zdev->fid); + update_uid_checking(false); + goto auto_allocate; + } + if (test_bit(zdev->domain, zpci_domain)) { spin_unlock(&zpci_domain_lock); pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", @@ -631,6 +638,7 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) spin_unlock(&zpci_domain_lock); return 0; } +auto_allocate: /* * We can always auto allocate domains below ZPCI_NR_DEVICES. 
* There is either a free domain or we have reached the maximum in diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 0d3d8f170ea4..ea794ae755ae 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,7 +24,7 @@ bool zpci_unique_uid; -static void update_uid_checking(bool new) +void update_uid_checking(bool new) { if (zpci_unique_uid != new) zpci_dbg(1, "uid checking:%d\n", new); -- cgit v1.2.3 From 6c7c851f1b666a8a455678a0b480b9162de86052 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Tue, 3 Mar 2020 16:42:01 +0100 Subject: s390/diag: fix display of diagnose call statistics Show the full diag statistic table and not just parts of it. The issue surfaced in a KVM guest with a number of vcpus defined smaller than NR_DIAG_STAT. Fixes: 1ec2772e0c3c ("s390/diag: add a statistic for diagnose calls") Cc: stable@vger.kernel.org Signed-off-by: Michael Mueller Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index e9dac9a24d3f..61f2b0412345 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -84,7 +84,7 @@ static int show_diag_stat(struct seq_file *m, void *v) static void *show_diag_stat_start(struct seq_file *m, loff_t *pos) { - return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL; + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; } static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) -- cgit v1.2.3 From 4141b6a5e9f171325effc36a22eb92bf961e7a5c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 23 Mar 2020 11:09:07 +0100 Subject: s390/cpum_sf: Fix wrong page count in error message When perf record -e SF_CYCLES_BASIC_DIAG runs with very high frequency, the samples arrive faster than the perf process can save them to file. 
Eventually, for longer running processes, this leads to the situation where the trace buffers allocated by perf slowly fill up. At one point the auxiliary trace buffer is full and the CPU Measurement sampling facility is turned off. Furthermore a warning is printed to the kernel log buffer: cpum_sf: The AUX buffer with 0 pages for the diagnostic-sampling mode is full The number of allocated pages for the auxiliary trace buffer is shown as zero pages. That is wrong. Fix this by saving the number of allocated pages before entering the work loop in the interrupt handler. When the interrupt handler processes the samples, it may detect the buffer full condition and stop sampling, reducing the buffer size to zero. Print the correct value in the error message: cpum_sf: The AUX buffer with 256 pages for the diagnostic-sampling mode is full Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_sf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index cf2020b8db44..85a711d783eb 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1578,6 +1578,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long range = 0, size; unsigned long long overflow = 0; struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; aux = perf_get_aux(handle); if (WARN_ON_ONCE(!aux)) @@ -1589,13 +1590,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) size >> PAGE_SHIFT); perf_aux_output_end(handle, size); + num_sdb = aux->sfb.num_sdb; while (!done) { /* Get an output handle */ aux = perf_aux_output_begin(handle, cpuhw->event); if (handle->size == 0) { pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", - aux->sfb.num_sdb); + num_sdb); debug_sprintf_event(sfdbg, 1, "%s: AUX buffer used up\n", __func__); -- cgit v1.2.3 From
2c7749b90536b76795eab4cada028c2ddad25fc3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Mar 2020 13:47:30 -0700 Subject: s390: use fallthrough; Convert the various uses of fallthrough comments to fallthrough; Done via script Link: https://lore.kernel.org/lkml/b56602fcf79f849e733e7b521bb0e17895d390fa.1582230379.git.joe.com/ Signed-off-by: Joe Perches Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/signal.c | 4 ++-- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/fault.c | 11 +++++------ arch/s390/mm/pgalloc.c | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e6fca5498e1f..b295090e2ce6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -487,7 +487,7 @@ void do_signal(struct pt_regs *regs) regs->gprs[2] = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->gprs[2] = regs->orig_gpr2; regs->psw.addr = @@ -514,7 +514,7 @@ void do_signal(struct pt_regs *regs) case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ regs->int_code = __NR_restart_syscall; - /* fallthrough */ + fallthrough; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index d03edebce754..5f70cefc13e4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -83,7 +83,7 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) cpumask_copy(&mask, cpu_present_mask); break; default: - /* fallthrough */ + fallthrough; case TOPOLOGY_MODE_SINGLE: cpumask_copy(&mask, cpumask_of(cpu)); break; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 151adef0d5dd..15c6c811d254 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -122,7 +122,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & _REGION_ENTRY_INVALID) goto out; table = (unsigned 
long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) @@ -131,7 +131,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & _REGION_ENTRY_INVALID) goto out; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) @@ -140,7 +140,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) @@ -327,7 +327,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, case VM_FAULT_BADACCESS: if (access == VM_EXEC && signal_return(regs) == 0) break; - /* fallthrough */ + fallthrough; case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. 
*/ if (user_mode(regs)) { @@ -337,9 +337,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, do_sigsegv(regs, si_code); break; } - /* fallthrough */ + fallthrough; case VM_FAULT_BADCONTEXT: - /* fallthrough */ case VM_FAULT_PFAULT: do_no_context(regs); break; diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d3be3fe2c55d..af3bddd5e568 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -326,7 +326,7 @@ void __tlb_remove_table(void *_table) mask >>= 24; if (mask != 0) break; - /* fallthrough */ + fallthrough; case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); -- cgit v1.2.3 From 712fa5f294f377ee3103c36c178e7d62c65dd108 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 23 Mar 2020 09:38:37 +0100 Subject: s390/mm: cleanup arch_get_unmapped_area() and friends Factor out check_asce_limit() function and fix few style defects in arch_get_unmapped_area() family of functions. Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens [heiko.carstens@de.ibm.com: small coding style changes] Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pgalloc.h | 14 ++++++++++++++ arch/s390/mm/hugetlbpage.c | 11 ++--------- arch/s390/mm/mmap.c | 40 +++++++++++----------------------------- 3 files changed, 27 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 77606c4acd58..f0d7457fa1da 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -48,6 +48,20 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); void crst_table_downgrade(struct mm_struct *); +static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + int rc; + + if (addr + len > mm->context.asce_limit && + addr + len <= TASK_SIZE) { + 
rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} + static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 5674710a4841..f01daddcbc5e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -326,7 +326,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - int rc; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -353,15 +352,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, else addr = hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - return addr; + return check_asce_limit(mm, addr, len); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index cbc718ba6d78..1b78f630a9ca 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -72,14 +72,13 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; - int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -105,30 +104,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; info.align_offset = pgoff 
<< PAGE_SHIFT; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; - int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -163,25 +152,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; } check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } /* -- cgit v1.2.3 From 6a3eb35e56b3308966945b76ec1dfbc18537feef Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 28 Feb 2020 11:32:01 +0100 Subject: s390/mm: remove page table downgrade support This update consolidates page table handling code. Because there are hardly any 31-bit binaries left we do not need to optimize for that. 
No extra efforts are needed to ensure that a compat task does not map anything above 2GB. The TASK_SIZE limit for 31-bit tasks is 2GB already and the generic code does check that a resulting map address would not surpass that limit. Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/mmu.h | 2 -- arch/s390/include/asm/mmu_context.h | 5 ----- arch/s390/include/asm/pgalloc.h | 16 +--------------- arch/s390/include/asm/processor.h | 1 - arch/s390/mm/pgalloc.c | 24 ------------------------ 5 files changed, 1 insertion(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bcfb6371086f..a8418e1379eb 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -32,8 +32,6 @@ typedef struct { unsigned int uses_cmm:1; /* The gmaps associated with this context are allowed to use huge pages. */ unsigned int allow_gmap_hpage_1m:1; - /* The mmu context is for compat task */ - unsigned int compat_mm:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8d04e6f3f796..3763734965e4 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -25,7 +25,6 @@ static inline int init_new_context(struct task_struct *tsk, atomic_set(&mm->context.flush_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; - mm->context.compat_mm = test_thread_flag(TIF_31BIT); #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || @@ -57,10 +56,6 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; break; - case _REGION3_SIZE: - /* forked 2-level compat task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - 
_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index f0d7457fa1da..5e3ff9f7a586 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -46,7 +46,6 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) } int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); -void crst_table_downgrade(struct mm_struct *); static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, unsigned long len) @@ -130,24 +129,11 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); - - if (!table) - return NULL; - if (mm->context.asce_limit == _REGION3_SIZE) { - /* Forking a compat process with 2 page table levels */ - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { - crst_table_free(mm, table); - return NULL; - } - } - return (pgd_t *) table; + return (pgd_t *) crst_table_alloc(mm); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == _REGION3_SIZE) - pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c9522346799f..dee5e57da518 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -179,7 +179,6 @@ typedef struct thread_struct thread_struct; regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ regs->psw.addr = new_psw; \ regs->gprs[15] = new_stackp; \ - crst_table_downgrade(current->mm); \ execve_tail(); \ } while (0) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index af3bddd5e568..4630fb7705ca 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -138,30 +138,6 @@ err_p4d: return -ENOMEM; } -void 
crst_table_downgrade(struct mm_struct *mm) -{ - pgd_t *pgd; - - /* downgrade should only happen from 3 to 2 levels (compat only) */ - VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); - - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); - } - - pgd = mm->pgd; - mm_dec_nr_pmds(mm); - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = _REGION3_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - crst_table_free(mm, (unsigned long *) pgd); - - if (current->active_mm == mm) - set_user_asce(mm); -} - static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) { unsigned int old, new; -- cgit v1.2.3 From f75556081afe5a565c2ce200837406303a59ae2b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 19 Mar 2020 13:44:49 +0100 Subject: s390/mm: cleanup virtual memory constants usage Remove duplicate definitions and consolidate usage of virtual and address translation constants.
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/processor.h | 8 ++++---- arch/s390/mm/pgalloc.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 3763734965e4..248d51cdcad9 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -42,11 +42,11 @@ static inline int init_new_context(struct task_struct *tsk, */ case 0: /* context created by exec, set asce limit to 4TB */ - mm->context.asce_limit = STACK_TOP_MAX; + mm->context.asce_limit = _REGION2_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; break; - case -PAGE_SIZE: + case TASK_SIZE_MAX: /* forked 5-level task, set new asce with new_mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION1; diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index dee5e57da518..d052adfd53f3 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -93,15 +93,15 @@ extern void __bpon(void); */ #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ - (1UL << 31) : -PAGE_SIZE) + _REGION3_SIZE : TASK_SIZE_MAX) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (1UL << 30) : (1UL << 41)) + (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) #define TASK_SIZE TASK_SIZE_OF(current) #define TASK_SIZE_MAX (-PAGE_SIZE) #define STACK_TOP (test_thread_flag(TIF_31BIT) ? 
\ - (1UL << 31) : (1UL << 42)) -#define STACK_TOP_MAX (1UL << 42) + _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP_MAX _REGION2_SIZE #define HAVE_ARCH_PICK_MMAP_LAYOUT diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 4630fb7705ca..498c98a312f4 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -121,7 +121,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) __pgd = (unsigned long *) mm->pgd; pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); mm->pgd = (pgd_t *) pgd; - mm->context.asce_limit = -PAGE_SIZE; + mm->context.asce_limit = TASK_SIZE_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION1; } @@ -527,7 +527,7 @@ void base_asce_free(unsigned long asce) base_region2_walk(table, 0, _REGION1_SIZE, 0); break; case _ASCE_TYPE_REGION1: - base_region1_walk(table, 0, -_PAGE_SIZE, 0); + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); break; } base_crst_free(table); -- cgit v1.2.3 From 1058c163dc31b3335c9cf7c4fa42ccf87be73017 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 19 Mar 2020 13:44:50 +0100 Subject: s390/mm: cleanup init_new_context() callback The set of values asce_limit may be assigned with is TASK_SIZE_MAX, _REGION1_SIZE, _REGION2_SIZE and 0 as a special case if the callback was called from execve(). Do VM_BUG_ON() if asce_limit is something else. Save few CPU cycles by removing unnecessary asce_limit re-assignment in case of 3-level task and redundant PGD entry type reconstruction. 
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/mmu_context.h | 35 +++++++++++++++++++++-------------- arch/s390/include/asm/pgalloc.h | 11 ----------- 2 files changed, 21 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 248d51cdcad9..844396b3735e 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -18,6 +18,8 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + unsigned long asce_type, init_entry; + spin_lock_init(&mm->context.lock); INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); @@ -35,29 +37,34 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { - case _REGION2_SIZE: + default: /* - * forked 3-level task, fall through to set new asce with new - * mm->pgd + * context created by exec, the value of asce_limit can + * only be zero in this case */ - case 0: - /* context created by exec, set asce limit to 4TB */ + VM_BUG_ON(mm->context.asce_limit); + /* continue as 3-level task */ mm->context.asce_limit = _REGION2_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + fallthrough; + case _REGION2_SIZE: + /* forked 3-level task */ + init_entry = _REGION3_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION3; break; case TASK_SIZE_MAX: - /* forked 5-level task, set new asce with new_mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + /* forked 5-level task */ + init_entry = _REGION1_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION1; break; case _REGION1_SIZE: - /* forked 4-level task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - 
_ASCE_USER_BITS | _ASCE_TYPE_REGION2; + /* forked 4-level task */ + init_entry = _REGION2_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION2; break; } - crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | asce_type; + crst_table_init((unsigned long *) mm->pgd, init_entry); return 0; } diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5e3ff9f7a586..74a352f8c0d1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -34,17 +34,6 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry) memset64((u64 *)crst, entry, _CRST_ENTRIES); } -static inline unsigned long pgd_entry_type(struct mm_struct *mm) -{ - if (mm_pmd_folded(mm)) - return _SEGMENT_ENTRY_EMPTY; - if (mm_pud_folded(mm)) - return _REGION3_ENTRY_EMPTY; - if (mm_p4d_folded(mm)) - return _REGION2_ENTRY_EMPTY; - return _REGION1_ENTRY_EMPTY; -} - int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3