From 47f7c6cf0082e1d963d1761b6bc2a94480fc8671 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 26 Jun 2021 15:34:37 +0800 Subject: s390/kprobes: use is_kernel() helper Use is_kernel() helper instead of is_kernel_addr(). [hca@linux.ibm.com: add missing unsigned long cast] Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Kefeng Wang Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/kprobes.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 528bb31815c3..52d056a5f89f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -92,11 +92,6 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static inline int is_kernel_addr(void *addr) -{ - return addr < (void *)_end; -} - static int s390_get_insn_slot(struct kprobe *p) { /* @@ -105,7 +100,7 @@ static int s390_get_insn_slot(struct kprobe *p) * field can be patched and executed within the insn slot. */ p->ainsn.insn = NULL; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); @@ -117,7 +112,7 @@ static void s390_free_insn_slot(struct kprobe *p) { if (!p->ainsn.insn) return; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); -- cgit v1.2.3 From 85b18d7b5e7ffefb2f076186511d39c4990aa005 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 12 Jan 2021 05:40:53 -0500 Subject: s390: mm: Fix secure storage access exception handling Turns out that the bit 61 in the TEID is not always 1 and if that's the case the address space ID and the address are unpredictable. Without an address and its address space ID we can't export memory and hence we can only send a SIGSEGV to the process or panic the kernel depending on who caused the exception. Unfortunately bit 61 is only reliable if we have the "misc" UV feature bit. 
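[Illustration, not part of the commit: the fix itself lands in the fault-handling code, which this listing (limited to arch/s390/kernel) does not show; only the sysfs export of uv_feature_indications appears below. A minimal C sketch of the logic the message describes, assuming a fault-handler context; test_bit_inv(), uv_info, user_mode() and force_sig() are existing kernel interfaces, while BIT_UV_FEAT_MISC and the function name are illustrative:]

static void handle_secure_storage_exception(struct pt_regs *regs)
{
	/* Bit 61 of the TEID is only reliable with the "misc" UV feature. */
	if (!test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications)) {
		/*
		 * No trustworthy address or address space ID, so the page
		 * cannot be exported: act on the originator instead.
		 */
		if (user_mode(regs))
			force_sig(SIGSEGV);	/* user space caused it */
		else
			panic("Unexpected secure storage exception in kernel mode");
		return;
	}
	/* ... regular path: resolve the address and export the page ... */
}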
Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Fixes: 084ea4d611a3d ("s390/mm: add (non)secure page access exceptions handlers") Cc: stable@vger.kernel.org Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/uv.c | 10 ++++++++++
 1 file changed, 10 insertions(+)
(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 6be2167943bb..aeb0a15bcbb7 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -358,6 +358,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
 static struct kobj_attribute uv_query_facilities_attr =
 	__ATTR(facilities, 0444, uv_query_facilities, NULL);

+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+					    struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+	__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
 static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *page)
 {
@@ -390,6 +399,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr =

 static struct attribute *uv_query_attrs[] = {
 	&uv_query_facilities_attr.attr,
+	&uv_query_feature_indications_attr.attr,
 	&uv_query_max_guest_cpus_attr.attr,
 	&uv_query_max_guest_vms_attr.attr,
 	&uv_query_max_guest_addr_attr.attr,
-- 
cgit v1.2.3


From fbbdfca5c5535f52ba47e46eacac899dfad7f384 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 18 Jun 2021 08:17:13 +0200
Subject: s390/entry.S: factor out SIEEXIT macro

Factor out SIEEXIT macro and use it instead of cleanup_sie routine. As a side effect %r13 and %r14 are spared.

Signed-off-by: Alexander Gordeev Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)
(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 3e8c6669373a..3287cb0d89ad 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -148,6 +148,13 @@ _LPP_OFFSET = __LC_LPP
 	clgr	%r14,%r13
 	jhe	\outside_label
 	.endm
+
+	.macro SIEEXIT
+	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
+	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
+	larl	%r9,sie_exit			# skip forward to sie_exit
+	.endm
 #endif

 	GEN_BR_THUNK %r14
@@ -235,7 +242,6 @@ ENTRY(sie64a)
 # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
 # Other instructions between sie64a and .Lsie_done should not cause program
 # interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
 .Lrewind_pad6:
 	nopr	7
 .Lrewind_pad4:
@@ -341,10 +347,7 @@ ENTRY(pgm_check_handler)
 #if IS_ENABLED(CONFIG_KVM)
 	# cleanup critical section for program checks in sie64a
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,1f
-	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	larl	%r9,sie_exit			# skip forward to sie_exit
+	SIEEXIT
 	lghi	%r10,_PIF_GUEST_FAULT
 #endif
 1:	tmhh	%r8,0x4000		# PER bit set in old PSW ?
@@ -410,7 +413,8 @@ ENTRY(\name)
 	jnz	1f
 #if IS_ENABLED(CONFIG_KVM)
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,0f
-	brasl	%r14,.Lcleanup_sie
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 #endif
 0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
@@ -556,7 +560,8 @@ ENTRY(mcck_int_handler)
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
 	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,5f
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-5:	brasl	%r14,.Lcleanup_sie
+5:	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 #endif
 	j	.Lmcck_stack
 .Lmcck_user:
@@ -657,15 +662,6 @@ ENTRY(stack_overflow)
 ENDPROC(stack_overflow)
 #endif

-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_sie:
-	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
-	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	larl	%r9,sie_exit			# skip forward to sie_exit
-	BR_EX	%r14,%r13
-#endif
 	.section .rodata, "a"
 #define SYSCALL(esame,emu)	.quad __s390x_ ## esame
 	.globl	sys_call_table
-- 
cgit v1.2.3


From e2c13d64200bff0aa3964017cfabb0bc47691022 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Tue, 22 Jun 2021 19:06:18 +0200
Subject: s390/mcck: optimize user mode check in case of !CONFIG_KVM

In case of !CONFIG_KVM use "jz" instead of "jnz" when detecting user mode and, as a result, get rid of an unnecessary jump.

Signed-off-by: Alexander Gordeev Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 3287cb0d89ad..ff715cc2b77b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -555,15 +555,17 @@ ENTRY(mcck_int_handler)
 	jno	.Lmcck_panic
 4:	ssm	__LC_PGM_NEW_PSW	# turn dat on, keep irqs off
 	tmhh	%r8,0x0001		# interrupting from user ?
-	jnz	.Lmcck_user
 #if IS_ENABLED(CONFIG_KVM)
+	jnz	.Lmcck_user
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
 	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,5f
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
 5:	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 	SIEEXIT
-#endif
 	j	.Lmcck_stack
+#else
+	jz	.Lmcck_stack
+#endif
 .Lmcck_user:
 	BPENTER	__TI_flags(%r12),_TIF_ISOLATE_BP
 .Lmcck_stack:
-- 
cgit v1.2.3


From 7f6dc8d4c880f64b9d450d780d88985b264d8793 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 18 Jun 2021 08:17:15 +0200
Subject: s390/mcck: always enter C handler with DAT enabled

The machine check handler must be entered with DAT disabled in case control registers are corrupted or a storage error happened and we cannot tell if such an error corresponds to a page table. Both of the described conditions end up stopping all CPUs and entering the disabled wait in the C half of the handler.

However, the storage errors are still checked only after DAT is enabled and C code is entered. In case a page table is damaged, such a flow is not expected to work.

This update paves the way for moving the storage error checks from the C to the assembler half. All fatal errors that can only be handled with DAT disabled are handled in the assembler half as well. As a result, the C half is only entered once DAT can be safely enabled.
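[Illustration, not part of the commit: a conceptual pseudo-C model of the flow this and the following patch establish. The mci bits are the ones used in the diffs below; the helper names are invented:]

void mcck_entry_model(union mci mci, int from_user)
{
	/* 1. DAT still off: conditions that permit no recovery stop the machine. */
	if (mci.sd || !mci.cr || !mci.wp || (!mci.ia && !from_user))
		stop_all_cpus_and_enter_disabled_wait();	/* invented helper */

	/* 2. Only with intact control registers is it safe to turn DAT on. */
	enable_dat();						/* invented helper */

	/* 3. The C half then handles the recoverable cases only. */
	s390_do_machine_check_c_half();				/* invented helper */
}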
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 35 ++++++++++++++++++++++++++++++++--- arch/s390/kernel/nmi.c | 29 ----------------------------- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index ff715cc2b77b..6bc8ed800458 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -570,7 +570,6 @@ ENTRY(mcck_int_handler) BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP .Lmcck_stack: lg %r15,__LC_MCCK_STACK -.Lmcck_skip: la %r11,STACK_FRAME_OVERHEAD(%r15) stctg %c1,%c1,__PT_CR1(%r11) lctlg %c1,%c1,__LC_KERNEL_ASCE @@ -612,8 +611,33 @@ ENTRY(mcck_int_handler) b __LC_RETURN_MCCK_LPSWE .Lmcck_panic: - lg %r15,__LC_NODAT_STACK - j .Lmcck_skip + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. + */ + lhi %r5,0 + lhi %r6,1 + larl %r7,.Lstop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,.Lthis_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b ENDPROC(mcck_int_handler) # @@ -664,6 +688,11 @@ ENTRY(stack_overflow) ENDPROC(stack_overflow) #endif + .section .data, "aw" + .align 4 +.Lstop_lock: .long 0 +.Lthis_cpu: .short 0 + .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 11f8c296f60d..a424f6e69b95 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -205,14 +205,6 @@ static int notrace s390_check_registers(union mci mci, int umode) s390_handle_damage(); kill_task = 1; } - /* Check control registers */ - if (!mci.cr) { - /* - * Control registers have unknown contents. - * Can't recover and therefore stopping machine. - */ - s390_handle_damage(); - } if (!mci.fp) { /* * Floating point registers can't be restored. If the @@ -273,22 +265,6 @@ static int notrace s390_check_registers(union mci mci, int umode) kill_task = 1; } } - /* Check if old PSW is valid */ - if (!mci.wp) { - /* - * Can't tell if we come from user or kernel mode - * -> stopping machine. - */ - s390_handle_damage(); - } - /* Check for invalid kernel instruction address */ - if (!mci.ia && !umode) { - /* - * The instruction address got lost while running - * in the kernel -> stopping machine. 
- */
- s390_handle_damage();
- }
 if (!mci.ms || !mci.pm || !mci.ia)
 	kill_task = 1;
@@ -353,11 +329,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 	mci.val = S390_lowcore.mcck_interruption_code;
 	mcck = this_cpu_ptr(&cpu_mcck);

-	if (mci.sd) {
-		/* System damage -> stopping machine */
-		s390_handle_damage();
-	}
-
 	/*
 	 * Reinject the instruction processing damages' machine checks
 	 * including Delayed Access Exception into the guest
-- 
cgit v1.2.3


From d35925b34996196d22a4357dc5212ab03af75151 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 18 Jun 2021 08:17:16 +0200
Subject: s390/mcck: move storage error checks to assembler

The current handling of storage errors is wrong: DAT is enabled in assembler code before the actual storage checks in the C half are executed. In case the page tables themselves are damaged, such an approach is not going to work.

With this update, unrecoverable storage errors are not passed to C code for handling, but rather the machine is stopped right away. The only exception to this flow is when a machine check occurs in a KVM guest - in this case the errors are reinjected by the handler.

Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 43 ++++++++++++++++++++++++++++++++-----------
 arch/s390/kernel/nmi.c | 15 ---------------
 2 files changed, 32 insertions(+), 26 deletions(-)
(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 6bc8ed800458..8f72a8f9bc33 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -129,6 +129,24 @@ _LPP_OFFSET = __LC_LPP
 		"jnz .+8; .long 0xb2e8d000", 82
 	.endm

+	/*
+	 * The CHKSTG macro jumps to the provided label in case the
+	 * machine check interruption code reports one of unrecoverable
+	 * storage errors:
+	 * - Storage error uncorrected
+	 * - Storage key error uncorrected
+	 * - Storage degradation with Failing-storage-address validity
+	 */
+	.macro CHKSTG errlabel
+	TSTMSK	__LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
+	jnz	\errlabel
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
+	jz	oklabel\@
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
+	jnz	\errlabel
+oklabel\@:
+	.endm
+
 #if IS_ENABLED(CONFIG_KVM)
 	/*
 	 * The OUTSIDE macro jumps to the provided label in case the value
@@ -550,23 +568,26 @@ ENTRY(mcck_int_handler)
 3:	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
 	jno	.Lmcck_panic
 	tmhh	%r8,0x0001		# interrupting from user ?
-	jnz	4f
+	jnz	6f
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
 	jno	.Lmcck_panic
-4:	ssm	__LC_PGM_NEW_PSW	# turn dat on, keep irqs off
-	tmhh	%r8,0x0001		# interrupting from user ?
 #if IS_ENABLED(CONFIG_KVM)
-	jnz	.Lmcck_user
-	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
-	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,5f
+	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,6f
+	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,4f
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-5:	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	j	5f
+4:	CHKSTG	.Lmcck_panic
+5:	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 	SIEEXIT
 	j	.Lmcck_stack
-#else
-	jz	.Lmcck_stack
 #endif
+6:	CHKSTG	.Lmcck_panic
+	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	tmhh	%r8,0x0001		# interrupting from user ?
+	jz	.Lmcck_stack
 	BPENTER	__TI_flags(%r12),_TIF_ISOLATE_BP
 .Lmcck_stack:
 	lg	%r15,__LC_MCCK_STACK
@@ -692,7 +713,7 @@ ENDPROC(stack_overflow)
 	.align	4
.Lstop_lock:	.long	0
.Lthis_cpu:	.short	0
-
+.Lstosm_tmp:	.byte	0
 	.section .rodata, "a"
 #define SYSCALL(esame,emu)	.quad __s390x_ ## esame
 	.globl	sys_call_table
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index a424f6e69b95..fdb5d23ac995 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -399,21 +399,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 		mcck_pending = 1;
 	}

-	/*
-	 * Reinject storage related machine checks into the guest if they
-	 * happen when the guest is running.
-	 */
-	if (!test_cpu_flag(CIF_MCCK_GUEST)) {
-		if (mci.se)
-			/* Storage error uncorrected */
-			s390_handle_damage();
-		if (mci.ke)
-			/* Storage key-error uncorrected */
-			s390_handle_damage();
-		if (mci.ds && mci.fa)
-			/* Storage degradation */
-			s390_handle_damage();
-	}
 	if (mci.cp) {
 		/* Channel report word pending */
 		mcck->channel_report = 1;
-- 
cgit v1.2.3


From 9f744abb4639e793689570fc9dcdf5f2f028bc9a Mon Sep 17 00:00:00 2001
From: Alexander Egorenkov
Date: Thu, 25 Mar 2021 12:10:56 +0100
Subject: s390/boot: replace magic string check with a bootdata flag

The magic string "S390EP" at offset 0x10008 indicated to the decompressed kernel that it was booted by the decompressor. Introduce a new bootdata flag instead which conveys the same information in an explicit and a cleaner way. But keep the magic string because it is a kernel ABI.

Signed-off-by: Alexander Egorenkov Reviewed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/early.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index c2cf79d353cf..fb84e3fc1686 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -33,6 +33,8 @@
 #include
 #include "entry.h"

+int __bootdata(is_full_image);
+
 static void __init reset_tod_clock(void)
 {
 	union tod_clock clk;
@@ -279,7 +281,7 @@ static void __init setup_boot_command_line(void)

 static void __init check_image_bootable(void)
 {
-	if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
+	if (is_full_image)
 		return;
 	sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
-- 
cgit v1.2.3


From 5fa2ea0714d75bf631c111ca51e9bd2bf6dbfb87 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Fri, 18 Jun 2021 08:17:18 +0200
Subject: s390/mcck: move register validation to C code

This update partially reverts commit 3037a52f9846 ("s390/nmi: do register validation as early as possible"). Storage error checks and control register validation are left in the assembler code, since correct ASCEs and page tables are required to enable DAT - which is done before the C handler is entered. System damage, kernel instruction address and PSW MWP checks are left in the assembler code as well, since there is no way to proceed if one of these checks fails.

The getcpu vdso syscall reads the CPU number from the programmable field of the TOD clock. Disregard the TOD programmable register validity bit and load the CPU number into the TOD programmable field unconditionally.
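[Illustration, not part of the commit: the TOD programmable field feeds the getcpu vdso path the message mentions, so a trivial user-space program exercises exactly the value the commit message describes. sched_getcpu() is the glibc wrapper; whether it actually takes the vdso path depends on libc and kernel version:]

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* On s390 this ultimately reads the CPU number that the kernel
	 * stores in the TOD programmable field. */
	int cpu = sched_getcpu();

	if (cpu < 0) {
		perror("sched_getcpu");
		return 1;
	}
	printf("running on CPU %d\n", cpu);
	return 0;
}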
Signed-off-by: Alexander Gordeev Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/asm-offsets.c | 6 --- arch/s390/kernel/entry.S | 39 +------------------ arch/s390/kernel/nmi.c | 85 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 78 insertions(+), 52 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f53605a3dfcd..77ff2130cb04 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -14,8 +14,6 @@ #include #include #include -#include -#include #include int main(void) @@ -108,7 +106,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); @@ -145,9 +142,6 @@ int main(void) OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); - /* extended machine check save area */ - OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); - BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 8f72a8f9bc33..5a2f70cbd3a9 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -506,8 +505,6 @@ ENTRY(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - sckc __LC_CLOCK_COMPARATOR # validate comparator - lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT lmg %r8,%r9,__LC_MCK_OLD_PSW @@ -518,41 +515,7 @@ ENTRY(mcck_int_handler) la %r14,4095 lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area - nill %r11,0xfc00 # MCESA_ORIGIN_MASK - TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE - jno 0f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID - jno 0f - .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC -0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID - jo 0f - sr %r14,%r14 -0: sfpc %r14 - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jo 0f - lghi %r14,__LC_FPREGS_SAVE_AREA - ld %f0,0(%r14) - ld %f1,8(%r14) - ld %f2,16(%r14) - ld %f3,24(%r14) - ld %f4,32(%r14) - ld %f5,40(%r14) - ld %f6,48(%r14) - ld %f7,56(%r14) - ld %f8,64(%r14) - ld %f9,72(%r14) - ld %f10,80(%r14) - ld %f11,88(%r14) - ld %f12,96(%r14) - ld %f13,104(%r14) - ld %f14,112(%r14) - ld %f15,120(%r14) - j 1f -0: VLM %v0,%v15,0,%r11 - VLM %v16,%v31,256,%r11 -1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fdb5d23ac995..20f8e1868853 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -189,12 +189,16 @@ void noinstr s390_handle_mcck(void) * returns 0 if all required registers are available * returns 1 otherwise */ -static int notrace s390_check_registers(union mci mci, int umode) +static int notrace 
s390_validate_registers(union mci mci, int umode) { + struct mcesa *mcesa; + void *fpt_save_area; union ctlreg2 cr2; int kill_task; + u64 zero; kill_task = 0; + zero = 0; if (!mci.gr) { /* @@ -217,35 +221,89 @@ static int notrace s390_check_registers(union mci mci, int umode) if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. * If the kernel currently uses the floating pointer * registers and needs the FPC register the system is * stopped. If the process has its floating pointer - * registers loaded it is terminated. + * registers loaded it is terminated. Otherwise the + * FPC is just validated. */ if (S390_lowcore.fpu_flags & KERNEL_FPC) s390_handle_damage(); + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); } - if (MACHINE_HAS_VX) { + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!MACHINE_HAS_VX) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + if (!mci.vr) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is * stopped. If the process has its vector registers - * loaded it is terminated. + * loaded it is terminated. Otherwise just validate + * the registers. */ if (S390_lowcore.fpu_flags & KERNEL_VXR) s390_handle_damage(); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + cr0.val = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + __ctl_load(cr0.val, 0, 0); + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } - /* Check if access registers are valid */ + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); if (!mci.ar) { /* * Access registers have unknown contents. @@ -253,7 +311,7 @@ static int notrace s390_check_registers(union mci mci, int umode) */ kill_task = 1; } - /* Check guarded storage registers */ + /* Validate guarded storage registers */ cr2.val = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { if (!mci.gs) { @@ -263,15 +321,26 @@ static int notrace s390_check_registers(union mci mci, int umode) * It has to be terminated. */ kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally. 
+	 */
+	set_tod_programmable_field(raw_smp_processor_id());
+	/* Validate clock comparator register */
+	set_clock_comparator(S390_lowcore.clock_comparator);

 	if (!mci.ms || !mci.pm || !mci.ia)
 		kill_task = 1;

 	return kill_task;
 }
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);

 /*
  * Backup the guest's machine check info to its description block
@@ -369,7 +438,7 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 			s390_handle_damage();
 		}
 	}
-	if (s390_check_registers(mci, user_mode(regs))) {
+	if (s390_validate_registers(mci, user_mode(regs))) {
 		/*
 		 * Couldn't restore all register contents for the
 		 * user space process -> mark task for termination.
-- 
cgit v1.2.3


From a029a4eab39e4bf542907a3263773fce3d48c983 Mon Sep 17 00:00:00 2001
From: Thomas Richter
Date: Fri, 25 Jun 2021 15:17:01 +0200
Subject: s390/cpumf: Allow concurrent access for CPU Measurement Counter Facility

Commit cf6acb8bdb1d ("s390/cpumf: Add support for complete counter set extraction") allows access to the CPU Measurement Counter Facility via character device /dev/hwctr. Access was exclusive, either via this device or via the perf_event_open() system call. Only one path at a time was permitted. The CPU Measurement Counter Facility device driver blocked access by other processes.

This patch removes this restriction and allows concurrent access to the CPU Measurement Counter Facility from multiple processes at the same time via perf_event_open() SVC and via /dev/hwctr device. The access via /dev/hwctr device is still exclusive, only one process is allowed to access this device.

This patch
- moves the /dev/hwctr device access from file perf_cpum_cf_diag.c to file perf_cpum_cf.c.
- uses only one trace buffer .../s390dbf/cpum_cf.
- removes the cfset_csd structure and includes its members in the structure cpu_cf_events. This results in one data structure and simplifies the access.
- reworks the function family ctr_set_enable, ctr_set_disable, ctr_set_start and ctr_set_stop, which operated on a counter set number. Now they operate on a counter set bit mask.
- moves the CF_DIAG event functionality to file perf_cpum_cf.c. It now contains the complete functionality of the CPU Measurement Counter Facility:
  - Performance measurement support for counters using perf stat.
  - Support for complete counter set extraction with device /dev/hwctr.
  - Support for counter set extraction event CF_DIAG attached to samples using perf record.
- removes file perf_cpum_cf_diag.c Signed-off-by: Thomas Richter Reviewed-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/Makefile | 1 - arch/s390/kernel/perf_cpum_cf.c | 1026 +++++++++++++++++++++++++++- arch/s390/kernel/perf_cpum_cf_common.c | 27 +- arch/s390/kernel/perf_cpum_cf_diag.c | 1148 -------------------------------- 4 files changed, 1002 insertions(+), 1200 deletions(-) delete mode 100644 arch/s390/kernel/perf_cpum_cf_diag.c (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 68ca1834316f..7a77f7f6f9d8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -71,7 +71,6 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 1b7a0525fbed..975a00c8c564 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,9 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2019 + * Copyright IBM Corp. 2012, 2021 * Author(s): Hendrik Brueckner + * Thomas Richter */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,7 +15,223 @@ #include #include #include +#include + #include +#include +#include + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + /* interval in seconds */ + +/* Counter sets are stored as data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a one byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a three byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight byte in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets are calculated and stored in the event + * sample data area. 
+ */ +struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpuid cpuid; + + te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ + te->csvn = cpuhw->info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* Read a counter set. The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with header containing the counter set number and + * the number of eight byte counters. + * + * The functions returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned on this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on error_ok value this is ok, + * for example when called from cpumf_pmu_start() call back function. 
+ */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + debug_sprintf_event(cf_dbg, 3, + "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" + " need %zd rc %d\n", __func__, ctrset, ctrset_size, + cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); + return need; +} + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 byte host an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. 
+ */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} static enum cpumf_ctr_set get_counter_set(u64 event) { @@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc) +static int validate_ctr_version(const struct hw_perf_event *hwc, + enum cpumf_ctr_set set) { struct cpu_cf_events *cpuhw; int err = 0; @@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) cpuhw = &get_cpu_var(cpu_cf_events); /* check required version for counter sets */ - switch (hwc->config_base) { + switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: if (cpuhw->info.cfvn < 1) @@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) (cpuhw->info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; } put_cpu_var(cpu_cf_events); @@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) static int validate_ctr_auth(const struct hw_perf_event *hwc) { struct cpu_cf_events *cpuhw; - u64 ctrs_state; int err = 0; cpuhw = &get_cpu_var(cpu_cf_events); @@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) * return with -ENOENT in order to fall back to other * PMUs that might suffice the event request. 
*/ - ctrs_state = cpumf_ctr_ctl[hwc->config_base]; - if (!(ctrs_state & cpuhw->info.auth_ctl)) + if (!(hwc->config_base & cpuhw->info.auth_ctl)) err = -ENOENT; put_cpu_var(cpu_cf_events); @@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu) if (cpuhw->flags & PMU_F_ENABLED) return; - err = lcctl(cpuhw->state); + err = lcctl(cpuhw->state | cpuhw->dev_state); if (err) { pr_err("Enabling the performance measuring unit " "failed with rc=%x\n", err); @@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; err = lcctl(inactive); if (err) { pr_err("Disabling the performance measuring unit " @@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; +static void cpumf_hw_inuse(void) +{ + mutex_lock(&pmc_reserve_mutex); + if (atomic_inc_return(&num_events) == 1) + __kernel_cpumcf_begin(); + mutex_unlock(&pmc_reserve_mutex); +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; @@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* * Use the hardware perf event structure to store the * counter number in the 'config' member and the counter - * set number in the 'config_base'. The counter set number - * is then later used to enable/disable the counter(s). + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). */ hwc->config = ev; - hwc->config_base = set; + hwc->config_base = cpumf_ctr_ctl[set]; break; case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ @@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) } /* Initialize for using the CPU-measurement counter facility */ - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; + cpumf_hw_inuse(); event->destroy = hw_perf_event_destroy; /* Finally, validate version and authorization of the counter set */ err = validate_ctr_auth(hwc); if (!err) - err = validate_ctr_version(hwc); + err = validate_ctr_version(hwc, set); return err; } @@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) return; @@ -376,29 +595,92 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) * needs to be synchronized. At this point, the counter set can be in * the inactive or disabled state. */ - hw_perf_event_reset(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } - /* increment refcount for this counter set */ - atomic_inc(&cpuhw->ctr_set[hwc->config_base]); + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. 
If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + raw.size = raw.frag.size; + data.raw = &raw; + } + + overflow = perf_event_overflow(event, &data, ®s); + debug_sprintf_event(cf_dbg, 3, + "%s event %#llx sample_type %#llx raw %d ov %d\n", + __func__, event->hw.config, + event->attr.sample_type, raw.size, overflow); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; } static void cpumf_pmu_stop(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) { /* Decrement reference count for this counter set and if this * is the last used counter in the set, clear activation * control and set the counter set state to inactive. */ - if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) - ctr_set_stop(&cpuhw->state, hwc->config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else + hw_perf_event_update(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) static void cpumf_pmu_del(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * clear enable control and resets all counters in a set. Therefore, * cpumf_pmu_start() always has to reenable a counter set. 
*/ - if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) - ctr_set_disable(&cpuhw->state, event->hw.config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); } /* Performance monitoring unit for s390x */ @@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; @@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void) if (!kernel_cpumcf_avail()) return -ENODEV; + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + return -ENOMEM; + }; + debug_register_view(cf_dbg, &debug_sprintf_view); + cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); - if (rc) + if (rc) { + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. + */ + +static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */ +static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ +struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +static struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ +} cfset_request; + +static void cfset_ctrset_clear(void) +{ + cpumask_clear(&cfset_request.mask); + cfset_request.ctrset = 0; +} + +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access + * path is currently used. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_enable() and its call backs + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable functions are used to turn off + * x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counters sets during scheduling. + * + * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr + * device access. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When /dev/hwctr interface is also used at the same time, the counter sets + * will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. 
At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */ +/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->dev_state = 0; + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_dec(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->flags &= ~PMU_F_IN_USE; + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int rc; + + debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", + __func__, cpuhw->state, cpuhw->dev_state); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->dev_state = 0; +} + +/* Release function is also called when application gets terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. 
+ */ +static int cfset_release(struct inode *inode, struct file *file) +{ + on_each_cpu(cfset_release_cpu, NULL, 1); + hw_perf_event_destroy(NULL); + cfset_ctrset_clear(); + atomic_set(&cfset_opencnt, 0); + return 0; +} + +static int cfset_open(struct inode *inode, struct file *file) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Only one user space program can open /dev/hwctr */ + if (atomic_xchg(&cfset_opencnt, 1)) + return -EBUSY; + + cpumf_hw_inuse(); + file->private_data = NULL; + /* nonseekable_open() never fails */ + return nonseekable_open(inode, file); +} + +static int cfset_all_stop(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + }; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + free_cpumask_var(mask); + return 0; +} + +static int cfset_all_start(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; + debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); + } + free_cpumask_var(mask); + return rc; +} + + +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + put_cpu_ptr(&cpu_cf_events); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) + return -EFAULT; + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + return -EFAULT; + debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, + uptr - (void __user *)ctrset_read->data); + return 0; +} + +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; 
+ if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_ctrset_size(set, &cpuhw->info); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } + debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, + cpuhw->sets, cpuhw->used); +} + +static int cfset_all_read(unsigned long arg) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = cfset_request.ctrset; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); return rc; } -subsys_initcall(cpumf_pmu_init); + +static long cfset_ioctl_read(unsigned long arg) +{ + struct s390_ctrset_read read; + int ret = 0; + + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cfset_all_read(arg); + return ret; +} + +static long cfset_ioctl_stop(void) +{ + int ret = ENXIO; + + if (cfset_request.ctrset) { + ret = cfset_all_stop(); + cfset_ctrset_clear(); + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (cfset_request.ctrset) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + cpumask_clear(&cfset_request.mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&cfset_request.mask, umask, len)) + return -EFAULT; + if (cpumask_empty(&cfset_request.mask)) + return -EINVAL; + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) + ret = -EFAULT; + if (ret) + goto out; + cfset_request.ctrset = start.counter_sets; + ret = cfset_all_start(); +out: + if (ret) + cfset_ctrset_clear(); + debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", + __func__, cfset_request.ctrset, need, ret); + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. 
The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP stop + * command, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from specified CPU list given + * with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + get_online_cpus(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + put_online_cpus(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, +}; + +int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ + debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, + event->attr.config, local64_read(&event->count)); +} + +static int get_authctrsets(void) +{ + struct cpu_cf_events *cpuhw; + unsigned long auth = 0; + enum cpumf_ctr_set i; + + cpuhw = &get_cpu_var(cpu_cf_events); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + put_cpu_var(cpu_cf_events); + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets result in specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * the hardware init function is either called per-cpu or just once + * for all CPUS (event->cpu == -1). This depends on the whether + * counting is started for all CPUs or on a per workload base where + * the perf event moves from one CPU to another CPU. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. 
+	 */
+	event->hw.config_base = get_authctrsets();
+
+	/* No authorized counter sets, nothing to count/sample */
+	if (!event->hw.config_base)
+		err = -EINVAL;
+
+	debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
+			    __func__, err, event->hw.config_base);
+	return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = -ENOENT;
+
+	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+	    event->attr.type != event->pmu->type)
+		goto out;
+
+	/* Raw events are used to access counters directly,
+	 * hence do not permit excludes.
+	 * This event is useless without PERF_SAMPLE_RAW to return counter set
+	 * values as raw data.
+	 */
+	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* Initialize for using the CPU-measurement counter facility */
+	cpumf_hw_inuse();
+	event->destroy = hw_perf_event_destroy;
+
+	err = cfdiag_event_init2(event);
+	if (unlikely(err))
+		event->destroy(event);
+out:
+	return err;
+}
+
+/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. The
+ * complete counter sets are attached as raw data to the artificial event.
+ * This results in complete counter sets available when a process is
+ * scheduled. Contains the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
+static struct attribute *cfdiag_events_attr[] = {
+	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+	NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+	.name = "events",
+	.attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+	.name = "format",
+	.attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+	&cfdiag_events_group,
+	&cfdiag_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable call back functions. They do not
+ * have a pointer to the perf_event structure as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always first parameter) via
+ * 'config' member.
+ */
+static struct pmu cf_diag = {
+	.task_ctx_nr = perf_sw_context,
+	.event_init = cfdiag_event_init,
+	.pmu_enable = cpumf_pmu_enable,
+	.pmu_disable = cpumf_pmu_disable,
+	.add = cpumf_pmu_add,
+	.del = cpumf_pmu_del,
+	.start = cpumf_pmu_start,
+	.stop = cpumf_pmu_stop,
+	.read = cfdiag_read,
+
+	.attr_groups = cfdiag_attr_groups
+};
+
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_ctrset_size(i, info); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: CPU speed extract static part. Used in case + * CPU Measurement Sampling Facility is turned off. + */ + if (test_facility(34)) { + unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; + } +} + +static int cfset_init(void) +{ + struct cpumf_ctr_info info; + size_t need; + int rc; + + if (qctri(&info)) + return -ENODEV; + + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } + + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; +} + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 2300fbaac556..30f0242de4a5 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { }, .alert = ATOMIC64_INIT(0), .state = 0, + .dev_state = 0, .flags = 0, + .used = 0, + .usedss = 0, + .sets = 0 }; /* Indicator whether the CPU-Measurement Counter Facility Support is ready */ static bool cpum_cf_initalized; @@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void) } EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(cpumcf_owner_lock); -static void *cpumcf_owner; - /* Initialize the CPU-measurement counter facility */ int __kernel_cpumcf_begin(void) { int flags = PMC_INIT; - int err = 0; - - spin_lock(&cpumcf_owner_lock); - if (cpumcf_owner) - err = -EBUSY; - else - cpumcf_owner = __builtin_return_address(0); - spin_unlock(&cpumcf_owner_lock); - if (err) - return err; on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); @@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void) on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - spin_lock(&cpumcf_owner_lock); - cpumcf_owner = NULL; - spin_unlock(&cpumcf_owner_lock); } EXPORT_SYMBOL(__kernel_cpumcf_end); @@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags) static int cpum_cf_online_cpu(unsigned int cpu) { - return cpum_cf_setup(cpu, PMC_INIT); + cpum_cf_setup(cpu, PMC_INIT); + return cfset_online_cpu(cpu); } static int cpum_cf_offline_cpu(unsigned int cpu) { + cfset_offline_cpu(cpu); return cpum_cf_setup(cpu, PMC_RELEASE); } diff --git 
a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c deleted file mode 100644 index 08c985c1097c..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support for s390x - CPU-measurement Counter Sets - * - * Copyright IBM Corp. 2019, 2021 - * Author(s): Hendrik Brueckner - * Thomas Richer - */ -#define KMSG_COMPONENT "cpum_cf_diag" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ - /* interval in seconds */ -static unsigned int cf_diag_cpu_speed; -static debug_info_t *cf_diag_dbg; - -struct cf_diag_csd { /* Counter set data per CPU */ - size_t used; /* Bytes used in data/start */ - unsigned char start[PAGE_SIZE]; /* Counter set at event start */ - unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ - unsigned int sets; /* # Counter set saved in data */ -}; -static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); - -/* Counter sets are stored as data stream in a page sized memory buffer and - * exported to user space via raw data attached to the event sample data. - * Each counter set starts with an eight byte header consisting of: - * - a two byte eye catcher (0xfeef) - * - a one byte counter set number - * - a two byte counter set size (indicates the number of counters in this set) - * - a three byte reserved value (must be zero) to make the header the same - * size as a counter value. - * All counter values are eight byte in size. - * - * All counter sets are followed by a 64 byte trailer. - * The trailer consists of a: - * - flag field indicating valid fields when corresponding bit set - * - the counter facility first and second version number - * - the CPU speed if nonzero - * - the time stamp the counter sets have been collected - * - the time of day (TOD) base value - * - the machine type. - * - * The counter sets are saved when the process is prepared to be executed on a - * CPU and saved again when the process is going to be removed from a CPU. - * The difference of both counter sets are calculated and stored in the event - * sample data area. - */ - -struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ - unsigned int def:16; /* 0-15 Data Entry Format */ - unsigned int set:16; /* 16-31 Counter set identifier */ - unsigned int ctr:16; /* 32-47 Number of stored counters */ - unsigned int res1:16; /* 48-63 Reserved */ -}; - -struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ - /* 0 - 7 */ - union { - struct { - unsigned int clock_base:1; /* TOD clock base set */ - unsigned int speed:1; /* CPU speed set */ - /* Measurement alerts */ - unsigned int mtda:1; /* Loss of MT ctr. data alert */ - unsigned int caca:1; /* Counter auth. 
change alert */ - unsigned int lcda:1; /* Loss of counter data alert */ - }; - unsigned long flags; /* 0-63 All indicators */ - }; - /* 8 - 15 */ - unsigned int cfvn:16; /* 64-79 Ctr First Version */ - unsigned int csvn:16; /* 80-95 Ctr Second Version */ - unsigned int cpu_speed:32; /* 96-127 CPU speed */ - /* 16 - 23 */ - unsigned long timestamp; /* 128-191 Timestamp (TOD) */ - /* 24 - 55 */ - union { - struct { - unsigned long progusage1; - unsigned long progusage2; - unsigned long progusage3; - unsigned long tod_base; - }; - unsigned long progusage[4]; - }; - /* 56 - 63 */ - unsigned int mach_type:16; /* Machine type */ - unsigned int res1:16; /* Reserved */ - unsigned int res2:32; /* Reserved */ -}; - -/* Create the trailer data at the end of a page. */ -static void cf_diag_trailer(struct cf_trailer_entry *te) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cpuid cpuid; - - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; - - get_cpu_id(&cpuid); /* Machine type */ - te->mach_type = cpuid.machine; - te->cpu_speed = cf_diag_cpu_speed; - if (te->cpu_speed) - te->speed = 1; - te->clock_base = 1; /* Save clock base */ - te->tod_base = tod_clock_base.tod; - te->timestamp = get_tod_clock_fast(); -} - -/* - * Change the CPUMF state to active. - * Enable and activate the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_enable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags |= PMU_F_ENABLED; -} - -/* - * Change the CPUMF state to inactive. - * Disable and enable (inactive) the CPU-counter sets according - * to the per-cpu control state. 
- */ -static void cf_diag_disable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 inactive; - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (!(cpuhw->flags & PMU_F_ENABLED)) - return; - - inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags &= ~PMU_F_ENABLED; -} - -/* Number of perf events counting hardware events */ -static atomic_t cf_diag_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(cf_diag_reserve_mutex); - -/* Release the PMU if event is the last perf event */ -static void cf_diag_perf_event_destroy(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, smp_processor_id(), - atomic_read(&cf_diag_events)); - if (atomic_dec_return(&cf_diag_events) == 0) - __kernel_cpumcf_end(); -} - -static int get_authctrsets(void) -{ - struct cpu_cf_events *cpuhw; - unsigned long auth = 0; - enum cpumf_ctr_set i; - - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - auth |= cpumf_ctr_ctl[i]; - } - put_cpu_var(cpu_cf_events); - return auth; -} - -/* Setup the event. Test for authorized counter sets and only include counter - * sets which are authorized at the time of the setup. Including unauthorized - * counter sets result in specification exception (and panic). - */ -static int __hw_perf_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, - event, event->cpu); - - event->hw.config = attr->config; - - /* Add all authorized counter sets to config_base. The - * the hardware init function is either called per-cpu or just once - * for all CPUS (event->cpu == -1). This depends on the whether - * counting is started for all CPUs or on a per workload base where - * the perf event moves from one CPU to another CPU. - * Checking the authorization on any CPU is fine as the hardware - * applies the same authorization settings to all CPUs. - */ - event->hw.config_base = get_authctrsets(); - - /* No authorized counter sets, nothing to count/sample */ - if (!event->hw.config_base) { - err = -EINVAL; - goto out; - } - - /* Set sample_period to indicate sampling */ - event->hw.sample_period = attr->sample_period; - local64_set(&event->hw.period_left, event->hw.sample_period); - event->hw.last_period = event->hw.sample_period; -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); - return err; -} - -/* Return 0 if the CPU-measurement counter facility is currently free - * and an error otherwise. 
- */ -static int cf_diag_perf_event_inuse(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&cf_diag_events)) { - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) == 0 && - __kernel_cpumcf_begin()) - err = -EBUSY; - else - err = atomic_inc_return(&cf_diag_events); - mutex_unlock(&cf_diag_reserve_mutex); - } - return err; -} - -static int cf_diag_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = -ENOENT; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d config %#llx type:%u " - "sample_type %#llx cf_diag_events %d\n", __func__, - event, event->cpu, attr->config, event->pmu->type, - attr->sample_type, atomic_read(&cf_diag_events)); - - if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || - event->attr.type != event->pmu->type) - goto out; - - /* Raw events are used to access counters directly, - * hence do not permit excludes. - * This event is usesless without PERF_SAMPLE_RAW to return counter set - * values as raw data. - */ - if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || - !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { - err = -EOPNOTSUPP; - goto out; - } - - /* Initialize for using the CPU-measurement counter facility */ - err = cf_diag_perf_event_inuse(); - if (err < 0) - goto out; - event->destroy = cf_diag_perf_event_destroy; - - err = __hw_perf_event_init(event); - if (unlikely(err)) - event->destroy(event); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_read(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); -} - -/* Calculate memory needed to store all counter sets together with header and - * trailer data. This is independend of the counter set authorization which - * can vary depending on the configuration. - */ -static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) -{ - size_t max_size = sizeof(struct cf_trailer_entry); - enum cpumf_ctr_set i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cpum_cf_ctrset_size(i, info); - - if (size) - max_size += size * sizeof(u64) + - sizeof(struct cf_ctrset_entry); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__, - max_size); - - return max_size; -} - -/* Read a counter set. The counter set number determines which counter set and - * the CPUM-CF first and second version number determine the number of - * available counters in this counter set. - * Each counter set starts with header containing the counter set number and - * the number of 8 byte counters. - * - * The functions returns the number of bytes occupied by this counter set - * including the header. - * If there is no counter in the counter set, this counter set is useless and - * zero is returned on this case. 
- */ -static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, - size_t room) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - size_t ctrset_size, need = 0; - int rc = 3; /* Assume write failure */ - - ctrdata->def = CF_DIAG_CTRSET_DEF; - ctrdata->set = ctrset; - ctrdata->res1 = 0; - ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); - - if (ctrset_size) { /* Save data */ - need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); - if (need <= room) - rc = ctr_stcctm(ctrset, ctrset_size, - (u64 *)(ctrdata + 1)); - if (rc != 3) - ctrdata->ctr = ctrset_size; - else - need = 0; - } - - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", - __func__, ctrset, ctrset_size, cpuhw->info.cfvn, - cpuhw->info.csvn, need, rc); - return need; -} - -/* Read out all counter sets and save them in the provided data buffer. - * The last 64 byte host an artificial trailer entry. - */ -static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth) -{ - struct cf_trailer_entry *trailer; - size_t offset = 0, done; - int i; - - memset(data, 0, sz); - sz -= sizeof(*trailer); /* Always room for trailer */ - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - struct cf_ctrset_entry *ctrdata = data + offset; - - if (!(auth & cpumf_ctr_ctl[i])) - continue; /* Counter set not authorized */ - - done = cf_diag_getctrset(ctrdata, i, sz - offset); - offset += done; - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d offset %zu done %zu\n", - __func__, i, offset, done); - } - trailer = data + offset; - cf_diag_trailer(trailer); - return offset + sizeof(*trailer); -} - -/* Calculate the difference for each counter in a counter set. */ -static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters) -{ - for (; --counters >= 0; ++pstart, ++pstop) - if (*pstop >= *pstart) - *pstop -= *pstart; - else - *pstop = *pstart - *pstop; -} - -/* Scan the counter sets and calculate the difference of each counter - * in each set. The result is the increment of each counter during the - * period the counter set has been activated. - * - * Return true on success. - */ -static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth) -{ - struct cf_trailer_entry *trailer_start, *trailer_stop; - struct cf_ctrset_entry *ctrstart, *ctrstop; - size_t offset = 0; - - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { - ctrstart = (struct cf_ctrset_entry *)(csd->start + offset); - ctrstop = (struct cf_ctrset_entry *)(csd->data + offset); - - if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { - pr_err("cpum_cf_diag counter set compare error " - "in set %i\n", ctrstart->set); - return 0; - } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; - if (ctrstart->def == CF_DIAG_CTRSET_DEF) { - cf_diag_diffctrset((u64 *)(ctrstart + 1), - (u64 *)(ctrstop + 1), ctrstart->ctr); - offset += ctrstart->ctr * sizeof(u64) + - sizeof(*ctrstart); - } - debug_sprintf_event(cf_diag_dbg, 6, - "%s set %d ctr %d offset %zu auth %lx\n", - __func__, ctrstart->set, ctrstart->ctr, - offset, auth); - } while (ctrstart->def && auth); - - /* Save time_stamp from start of event in stop's trailer */ - trailer_start = (struct cf_trailer_entry *)(csd->start + offset); - trailer_stop = (struct cf_trailer_entry *)(csd->data + offset); - trailer_stop->progusage[0] = trailer_start->timestamp; - - return 1; -} - -/* Create perf event sample with the counter sets as raw data. 
The sample - * is then pushed to the event subsystem and the function checks for - * possible event overflows. If an event overflow occurs, the PMU is - * stopped. - * - * Return non-zero if an event overflow occurred. - */ -static int cf_diag_push_sample(struct perf_event *event, - struct cf_diag_csd *csd) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - perf_sample_data_init(&data, 0, event->hw.last_period); - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = event->cpu; - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = csd->used; - raw.frag.data = csd->data; - raw.size = csd->used; - data.raw = &raw; - } - - overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_diag_dbg, 6, - "%s event %p cpu %d sample_type %#llx raw %d " - "ov %d\n", __func__, event, event->cpu, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); - - perf_event_update_userpage(event); - return overflow; -} - -static void cf_diag_start(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - /* (Re-)enable and activate all counter sets */ - lcctl(0); /* Reset counter sets */ - hwc->state = 0; - ctr_set_multiple_enable(&cpuhw->state, hwc->config_base); - lcctl(cpuhw->state); /* Enable counter sets */ - csd->used = cf_diag_getctr(csd->start, sizeof(csd->start), - event->hw.config_base); - ctr_set_multiple_start(&cpuhw->state, hwc->config_base); - /* Function cf_diag_enable() starts the counter sets. 
*/ -} - -static void cf_diag_stop(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - - /* Deactivate all counter sets */ - ctr_set_multiple_stop(&cpuhw->state, hwc->config_base); - local64_inc(&event->count); - csd->used = cf_diag_getctr(csd->data, sizeof(csd->data), - event->hw.config_base); - if (cf_diag_diffctr(csd, event->hw.config_base)) - cf_diag_push_sample(event, csd); - hwc->state |= PERF_HES_STOPPED; -} - -static int cf_diag_add(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw %p\n", - __func__, event, event->cpu, flags, cpuhw); - - if (cpuhw->flags & PMU_F_IN_USE) { - err = -EAGAIN; - goto out; - } - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - cpuhw->flags |= PMU_F_IN_USE; - if (flags & PERF_EF_START) - cf_diag_start(event, PERF_EF_RELOAD); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_del(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x\n", - __func__, event, event->cpu, flags); - - cf_diag_stop(event, PERF_EF_UPDATE); - ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base); - ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base); - cpuhw->flags &= ~PMU_F_IN_USE; -} - -/* Default counter set events and format attribute groups */ - -CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); - -static struct attribute *cf_diag_events_attr[] = { - CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), - NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *cf_diag_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group cf_diag_events_group = { - .name = "events", - .attrs = cf_diag_events_attr, -}; -static struct attribute_group cf_diag_format_group = { - .name = "format", - .attrs = cf_diag_format_attr, -}; -static const struct attribute_group *cf_diag_attr_groups[] = { - &cf_diag_events_group, - &cf_diag_format_group, - NULL, -}; - -/* Performance monitoring unit for s390x */ -static struct pmu cf_diag = { - .task_ctx_nr = perf_sw_context, - .pmu_enable = cf_diag_enable, - .pmu_disable = cf_diag_disable, - .event_init = cf_diag_event_init, - .add = cf_diag_add, - .del = cf_diag_del, - .start = cf_diag_start, - .stop = cf_diag_stop, - .read = cf_diag_read, - - .attr_groups = cf_diag_attr_groups -}; - -/* Get the CPU speed, try sampling facility first and CPU attributes second. */ -static void cf_diag_get_cpu_speed(void) -{ - if (cpum_sf_avail()) { /* Sampling facility first */ - struct hws_qsi_info_block si; - - memset(&si, 0, sizeof(si)); - if (!qsi(&si)) { - cf_diag_cpu_speed = si.cpu_speed; - return; - } - } - - if (test_facility(34)) { /* CPU speed extract static part */ - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cf_diag_cpu_speed = mhz & 0xffffffff; - } -} - -/* Code to create device and file I/O operations */ -static atomic_t ctrset_opencnt = ATOMIC_INIT(0); /* Excl. 
access */ - -static int cf_diag_open(struct inode *inode, struct file *file) -{ - int err = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (atomic_xchg(&ctrset_opencnt, 1)) - return -EBUSY; - - /* Avoid concurrent access with perf_event_open() system call */ - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin()) - err = -EBUSY; - mutex_unlock(&cf_diag_reserve_mutex); - if (err) { - atomic_set(&ctrset_opencnt, 0); - return err; - } - file->private_data = NULL; - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - /* nonseekable_open() never fails */ - return nonseekable_open(inode, file); -} - -/* Variables for ioctl() interface support */ -static DEFINE_MUTEX(cf_diag_ctrset_mutex); -static struct cf_diag_ctrset { - unsigned long ctrset; /* Bit mask of counter set to read */ - cpumask_t mask; /* CPU mask to read from */ -} cf_diag_ctrset; - -static void cf_diag_ctrset_clear(void) -{ - cpumask_clear(&cf_diag_ctrset.mask); - cf_diag_ctrset.ctrset = 0; -} - -static void cf_diag_release_cpu(void *p) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__, - smp_processor_id()); - lcctl(0); /* Reset counter sets */ - cpuhw->state = 0; /* Save state in CPU hardware state */ -} - -/* Release function is also called when application gets terminated without - * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. - * Since only one application is allowed to open the device, simple stop all - * CPU counter sets. - */ -static int cf_diag_release(struct inode *inode, struct file *file) -{ - on_each_cpu(cf_diag_release_cpu, NULL, 1); - cf_diag_ctrset_clear(); - atomic_set(&ctrset_opencnt, 0); - __kernel_cpumcf_end(); - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - return 0; -} - -struct cf_diag_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ - unsigned int sets; /* Counter set bit mask */ - atomic_t cpus_ack; /* # CPUs successfully executed func */ -}; - -static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask) -{ - struct s390_ctrset_read __user *ctrset_read; - unsigned int cpu, cpus, rc; - void __user *uptr; - - ctrset_read = (struct s390_ctrset_read __user *)arg; - uptr = ctrset_read->data; - for_each_cpu(cpu, mask) { - struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu); - struct s390_ctrset_cpudata __user *ctrset_cpudata; - - ctrset_cpudata = uptr; - debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n", - __func__, cpu, csd->used); - rc = put_user(cpu, &ctrset_cpudata->cpu_nr); - rc |= put_user(csd->sets, &ctrset_cpudata->no_sets); - rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used); - if (rc) - return -EFAULT; - uptr += sizeof(struct s390_ctrset_cpudata) + csd->used; - cond_resched(); - } - cpus = cpumask_weight(mask); - if (put_user(cpus, &ctrset_read->no_cpus)) - return -EFAULT; - debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n", - __func__, uptr - (void __user *)ctrset_read->data); - return 0; -} - -static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, - int ctrset_size, size_t room) -{ - size_t need = 0; - int rc = -1; - - need = sizeof(*p) + sizeof(u64) * ctrset_size; - debug_sprintf_event(cf_diag_dbg, 5, - "%s room %zd need %zd set %#x set_size %d\n", - __func__, room, need, ctrset, ctrset_size); - if (need <= room) { - p->set = cpumf_ctr_ctl[ctrset]; - p->no_cnts = ctrset_size; - rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); - if (rc == 3) /* Nothing stored */ - need = 
0; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__, - need, rc); - return need; -} - -/* Read all counter sets. Since the perf_event_open() system call with - * event cpum_cf_diag/.../ is blocked when this interface is active, reuse - * the perf_event_open() data buffer to store the counter sets. - */ -static void cf_diag_cpu_read(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct cf_diag_call_on_cpu_parm *p = parm; - int set, set_size; - size_t space; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - /* No data saved yet */ - csd->used = 0; - csd->sets = 0; - memset(csd->data, 0, sizeof(csd->data)); - - /* Scan the counter sets */ - for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { - struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used; - - if (!(p->sets & cpumf_ctr_ctl[set])) - continue; /* Counter set not in list */ - set_size = cpum_cf_ctrset_size(set, &cpuhw->info); - space = sizeof(csd->data) - csd->used; - space = cf_diag_cpuset_read(sp, set, set_size, space); - if (space) { - csd->used += space; - csd->sets += 1; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n", - __func__, sp, space); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__, - csd->sets, csd->used); -} - -static int cf_diag_all_read(unsigned long arg) -{ - struct cf_diag_call_on_cpu_parm p; - cpumask_var_t mask; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - p.sets = cf_diag_ctrset.ctrset; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); - rc = cf_diag_all_copy(arg, mask); - free_cpumask_var(mask); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); - return rc; -} - -/* Stop all counter sets via ioctl interface */ -static void cf_diag_ioctl_off(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - ctr_set_multiple_disable(&cpuhw->state, p->sets); - ctr_set_multiple_stop(&cpuhw->state, p->sets); - rc = lcctl(cpuhw->state); /* Stop counter sets */ - if (!cpuhw->state) - cpuhw->flags &= ~PMU_F_IN_USE; - debug_sprintf_event(cf_diag_dbg, 5, - "%s rc %d flags %#x state %#llx\n", __func__, - rc, cpuhw->flags, cpuhw->state); -} - -/* Start counter sets on particular CPU */ -static void cf_diag_ioctl_on(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - if (!(cpuhw->flags & PMU_F_IN_USE)) - cpuhw->state = 0; - cpuhw->flags |= PMU_F_IN_USE; - rc = lcctl(cpuhw->state); /* Reset unused counter sets */ - ctr_set_multiple_enable(&cpuhw->state, p->sets); - ctr_set_multiple_start(&cpuhw->state, p->sets); - rc |= lcctl(cpuhw->state); /* Start counter sets */ - if (!rc) - atomic_inc(&p->cpus_ack); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n", - __func__, rc, cpuhw->state); -} - -static int cf_diag_all_stop(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = 
cf_diag_ctrset.ctrset, - }; - cpumask_var_t mask; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; -} - -static int cf_diag_all_start(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = cf_diag_ctrset.ctrset, - .cpus_ack = ATOMIC_INIT(0), - }; - cpumask_var_t mask; - int rc = 0; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1); - if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - rc = -EIO; - } - free_cpumask_var(mask); - return rc; -} - -/* Return the maximum required space for all possible CPUs in case one - * CPU will be onlined during the START, READ, STOP cycles. - * To find out the size of the counter sets, any one CPU will do. They - * all have the same counter sets. - */ -static size_t cf_diag_needspace(unsigned int sets) -{ - struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); - size_t bytes = 0; - int i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (!(sets & cpumf_ctr_ctl[i])) - continue; - bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + - sizeof(((struct s390_ctrset_setdata *)0)->set) + - sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); - } - bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * - (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + - sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); - debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, - bytes); - put_cpu_ptr(&cpu_cf_events); - return bytes; -} - -static long cf_diag_ioctl_read(unsigned long arg) -{ - struct s390_ctrset_read read; - int ret = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cf_diag_all_read(arg); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_stop(void) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - ret = cf_diag_all_stop(); - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_start(unsigned long arg) -{ - struct s390_ctrset_start __user *ustart; - struct s390_ctrset_start start; - void __user *umask; - unsigned int len; - int ret = 0; - size_t need; - - if (cf_diag_ctrset.ctrset) - return -EBUSY; - ustart = (struct s390_ctrset_start __user *)arg; - if (copy_from_user(&start, ustart, sizeof(start))) - return -EFAULT; - if (start.version != S390_HWCTR_START_VERSION) - return -EINVAL; - if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | - cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | - cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | - cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | - cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) - return -EINVAL; /* Invalid counter set */ - if (!start.counter_sets) - return -EINVAL; /* No counter set at all? 
*/ - cpumask_clear(&cf_diag_ctrset.mask); - len = min_t(u64, start.cpumask_len, cpumask_size()); - umask = (void __user *)start.cpumask; - if (copy_from_user(&cf_diag_ctrset.mask, umask, len)) - return -EFAULT; - if (cpumask_empty(&cf_diag_ctrset.mask)) - return -EINVAL; - need = cf_diag_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cf_diag_ctrset.ctrset = start.counter_sets; - ret = cf_diag_all_start(); -out: - if (ret) - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n", - __func__, cf_diag_ctrset.ctrset, need, ret); - return ret; -} - -static long cf_diag_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__, - cmd, arg); - get_online_cpus(); - mutex_lock(&cf_diag_ctrset_mutex); - switch (cmd) { - case S390_HWCTR_START: - ret = cf_diag_ioctl_start(arg); - break; - case S390_HWCTR_STOP: - ret = cf_diag_ioctl_stop(); - break; - case S390_HWCTR_READ: - ret = cf_diag_ioctl_read(arg); - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&cf_diag_ctrset_mutex); - put_online_cpus(); - debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret); - return ret; -} - -static const struct file_operations cf_diag_fops = { - .owner = THIS_MODULE, - .open = cf_diag_open, - .release = cf_diag_release, - .unlocked_ioctl = cf_diag_ioctl, - .compat_ioctl = cf_diag_ioctl, - .llseek = no_llseek -}; - -static struct miscdevice cf_diag_dev = { - .name = S390_HWCTR_DEVICE, - .minor = MISC_DYNAMIC_MINOR, - .fops = &cf_diag_fops, -}; - -static int cf_diag_online_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_on(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -static int cf_diag_offline_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_off(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -/* Initialize the counter set PMU to generate complete counter set data as - * event raw data. This relies on the CPU Measurement Counter Facility device - * already being loaded and initialized. - */ -static int __init cf_diag_init(void) -{ - struct cpumf_ctr_info info; - size_t need; - int rc; - - if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info)) - return -ENODEV; - cf_diag_get_cpu_speed(); - - /* Make sure the counter set data fits into predefined buffer. 
*/ - need = cf_diag_ctrset_maxsize(&info); - if (need > sizeof(((struct cf_diag_csd *)0)->start)) { - pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", - need); - return -ENOMEM; - } - - rc = misc_register(&cf_diag_dev); - if (rc) { - pr_err("Registration of /dev/" S390_HWCTR_DEVICE - "failed rc=%d\n", rc); - goto out; - } - - /* Setup s390dbf facility */ - cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); - if (!cf_diag_dbg) { - pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - rc = -ENOMEM; - goto out_dbf; - } - debug_register_view(cf_diag_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); - if (rc) { - pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", - rc); - goto out_perf; - } - rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE, - "perf/s390/cfd:online", - cf_diag_online_cpu, cf_diag_offline_cpu); - if (!rc) - goto out; - - pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n", - rc); - perf_pmu_unregister(&cf_diag); -out_perf: - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); -out_dbf: - misc_deregister(&cf_diag_dev); -out: - return rc; -} -device_initcall(cf_diag_init); -- cgit v1.2.3 From b8e9cc20b808e26329090c19ff80b7f5098e98ff Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 2 Jul 2021 15:54:55 +0200 Subject: s390/traps: do not test MONITOR CALL without CONFIG_BUG tinyconfig fails to boot, because without CONFIG_BUG report_bug() always returns BUG_TRAP_TYPE_BUG, which causes mc 0,0 in test_monitor_call() to panic. Fix by skipping the test without CONFIG_BUG. Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 019c5748b607..3a6d08d6df6f 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -277,6 +277,8 @@ static void __init test_monitor_call(void) { int val = 1; + if (!IS_ENABLED(CONFIG_BUG)) + return; asm volatile( " mc 0,0\n" "0: xgr %0,%0\n" -- cgit v1.2.3 From d57778feb9878aa6b79c615fd029c2112d40a747 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 23 Jun 2021 14:10:00 +0200 Subject: s390/vdso: always enable vdso With the upcoming move of the svc sigreturn instruction from the signal frame to vdso we need to have vdso always enabled. 
Signed-off-by: Sven Schnelle
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/vdso.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 8c4e07d533c8..f786246e621a 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -37,18 +37,6 @@ enum vvar_pages {
 	VVAR_NR_PAGES,
 };

-unsigned int __read_mostly vdso_enabled = 1;
-
-static int __init vdso_setup(char *str)
-{
-	bool enabled;
-
-	if (!kstrtobool(str, &enabled))
-		vdso_enabled = enabled;
-	return 1;
-}
-__setup("vdso=", vdso_setup);
-
 #ifdef CONFIG_TIME_NS
 struct vdso_data *arch_get_vdso_data(void *vvar_page)
 {
@@ -176,7 +164,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	int rc;

 	BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
-	if (!vdso_enabled || is_compat_task())
+	if (is_compat_task())
 		return 0;
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
@@ -218,10 +206,9 @@ static int __init vdso_init(void)
 	vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT;
 	pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		vdso_enabled = 0;
-		return -ENOMEM;
-	}
+	if (!pages)
+		panic("failed to allocate VDSO pages");
+
 	for (i = 0; i < vdso_pages; i++)
 		pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE);
 	pages[vdso_pages] = NULL;
-- cgit v1.2.3

From 686341f2548b5a4c4ab1ee22427e046027ae1c9c Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Fri, 25 Jun 2021 14:48:28 +0200
Subject: s390/vdso64: add sigreturn,rt_sigreturn and restart_syscall

Add minimalistic trampolines to vdso64 so we can return from a signal
without using the stack, which would require pgm check handler hacks when
NX is enabled.

restart_syscall will be called from vdso to work around the architectural
limitation that the syscall number might be encoded in the svc
instruction, and therefore cannot be changed.
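As a hedged illustration of how these trampolines are meant to be reached
(the actual signal-frame wiring lands in a later patch of this series, and
the vdso64_offset_rt_sigreturn constant is an assumption based on the
gen_vdso_offsets.sh helper added later):

	/* Sketch only: point the signal return address (%r14) at the
	 * vdso trampoline instead of copying an svc instruction to the
	 * (possibly non-executable) stack. The offset constant comes
	 * from the generated vdso offsets header. */
	regs->gprs[14] = current->mm->context.vdso_base +
			 vdso64_offset_rt_sigreturn;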
Signed-off-by: Sven Schnelle
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/vdso64/vdso64.lds.S        |  3 +++
 arch/s390/kernel/vdso64/vdso_user_wrapper.S | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 518f1ea405f4..489a72a69594 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -137,6 +137,9 @@ VERSION
 		__kernel_clock_gettime;
 		__kernel_clock_getres;
 		__kernel_getcpu;
+		__kernel_restart_syscall;
+		__kernel_rt_sigreturn;
+		__kernel_sigreturn;
 	local: *;
 	};
 }
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
index f773505c7e63..97f0c0a669a5 100644
--- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -37,3 +37,20 @@ vdso_func gettimeofday
 vdso_func clock_getres
 vdso_func clock_gettime
 vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+	.globl __kernel_\func
+	.type __kernel_\func,@function
+	.align 8
+__kernel_\func:
+	CFI_STARTPROC
+	svc	\syscall
+	/* Make sure we notice when a syscall returns, which shouldn't happen */
+	.word	0
+	CFI_ENDPROC
+	.size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
-- cgit v1.2.3

From 43e1f76b0b69b86b2175ef755243e61fe40c75db Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Fri, 25 Jun 2021 15:10:35 +0200
Subject: s390/vdso: rename VDSO64_LBASE to VDSO_LBASE

Will be used by both vdso32 and vdso64, so change the name.

Signed-off-by: Sven Schnelle
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/vdso64/vdso64.lds.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 489a72a69594..d4fb336d747b 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -17,7 +17,7 @@ SECTIONS
 #ifdef CONFIG_TIME_NS
 	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
 #endif
-	. = VDSO64_LBASE + SIZEOF_HEADERS;
+	. = VDSO_LBASE + SIZEOF_HEADERS;

 	.hash		: { *(.hash) }			:text
 	.gnu.hash	: { *(.gnu.hash) }
-- cgit v1.2.3

From 779df2248739b6308c03b354c99e4c352141e3bc Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Fri, 25 Jun 2021 14:50:08 +0200
Subject: s390/vdso: add minimal compat vdso

Add a small vdso for 31 bit compat applications that provides trampolines
for calls to sigreturn,rt_sigreturn,syscall_restart. This is required for
moving these syscalls away from the signal frame to the vdso.

Note that this patch effectively disables CONFIG_COMPAT when using clang
to compile the kernel. clang doesn't support 31 bit mode.

We want to redirect sigreturn and restart_syscall to the vdso. However,
the kernel cannot parse the ELF vdso file, so we need to generate header
files which contain the offsets of the syscall instructions in the vdso
page.
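To make the generation step concrete, here is a hedged sketch of what
include/generated/vdso32-offsets.h would contain after piping the $(NM)
output through the sed helper added below; the hexadecimal offsets are
illustrative, not taken from a real build:

	/* Illustrative generated header; real offsets depend on the build */
	#define vdso32_offset_restart_syscall	0x0a00
	#define vdso32_offset_rt_sigreturn	0x0a10
	#define vdso32_offset_sigreturn	0x0a08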
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/Makefile | 1 + arch/s390/kernel/vdso.c | 48 ++++++---- arch/s390/kernel/vdso32/.gitignore | 2 + arch/s390/kernel/vdso32/Makefile | 75 +++++++++++++++ arch/s390/kernel/vdso32/gen_vdso_offsets.sh | 15 +++ arch/s390/kernel/vdso32/note.S | 13 +++ arch/s390/kernel/vdso32/vdso32.lds.S | 141 ++++++++++++++++++++++++++++ arch/s390/kernel/vdso32/vdso32_wrapper.S | 15 +++ arch/s390/kernel/vdso32/vdso_user_wrapper.S | 21 +++++ arch/s390/kernel/vdso64/Makefile | 8 ++ arch/s390/kernel/vdso64/gen_vdso_offsets.sh | 15 +++ 11 files changed, 338 insertions(+), 16 deletions(-) create mode 100644 arch/s390/kernel/vdso32/.gitignore create mode 100644 arch/s390/kernel/vdso32/Makefile create mode 100755 arch/s390/kernel/vdso32/gen_vdso_offsets.sh create mode 100644 arch/s390/kernel/vdso32/note.S create mode 100644 arch/s390/kernel/vdso32/vdso32.lds.S create mode 100644 arch/s390/kernel/vdso32/vdso32_wrapper.S create mode 100644 arch/s390/kernel/vdso32/vdso_user_wrapper.S create mode 100755 arch/s390/kernel/vdso64/gen_vdso_offsets.sh (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7a77f7f6f9d8..4a44ba5a2d73 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -77,3 +77,4 @@ obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += # vdso obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index f786246e621a..99694260cac9 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -20,7 +20,7 @@ #include extern char vdso64_start[], vdso64_end[]; -static unsigned int vdso_pages; +extern char vdso32_start[], vdso32_end[]; static struct vm_special_mapping vvar_mapping; @@ -143,7 +143,12 @@ static struct vm_special_mapping vvar_mapping = { .fault = vvar_fault, }; -static struct vm_special_mapping vdso_mapping = { +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -159,16 +164,22 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_text_len, vdso_mapping_len; unsigned long vvar_start, vdso_text_start; + struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); - if (is_compat_task()) - return 0; if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_text_len = vdso_pages << PAGE_SHIFT; + + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); rc = vvar_start; @@ -186,7 +197,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_mapping); + vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -199,20 +210,25 @@ out: return rc; } -static int __init vdso_init(void) +static struct page ** __init vdso_setup_pages(void *start, void *end) { - struct page **pages; + int pages = (end - start) >> 
PAGE_SHIFT; + struct page **pagelist; int i; - vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT; - pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); - if (!pages) - panic("failed to allocate VDSO pages"); + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} - for (i = 0; i < vdso_pages; i++) - pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE); - pages[vdso_pages] = NULL; - vdso_mapping.pages = pages; +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..5167384843b9 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000000..b2349a3f4fa3 --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n +ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE +ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT + +include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + 
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+	$(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..9c4f951e227d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like VDSO_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 000000000000..db19d0680a0a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..bff50b6acd6d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 31 bit compat vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+	PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+	. = VDSO_LBASE + SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+
+	. = ALIGN(16);
+	.text : {
+		*(.text .stub .text.* .gnu.linkonce.t.*)
+	} :text
+	PROVIDE(__etext = .);
+	PROVIDE(_etext = .);
+	PROVIDE(etext = .);
+
+	/*
+	 * Other stuff is appended to the text segment:
+	 */
+	.rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+	.rodata1	: { *(.rodata1) }
+
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+	.gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+	.rela.dyn ALIGN(8) : { *(.rela.dyn) }
+	.got ALIGN(8)	: { *(.got .toc) }
+
+	_end = .;
+	PROVIDE(end = .);
+
+	/*
+	 * Stabs debugging sections are here too.
+	 */
+	.stab 0 : { *(.stab) }
+	.stabstr 0 : { *(.stabstr) }
+	.stab.excl 0 : { *(.stab.excl) }
+	.stab.exclstr 0 : { *(.stab.exclstr) }
+	.stab.index 0 : { *(.stab.index) }
+	.stab.indexstr 0 : { *(.stab.indexstr) }
+	.comment 0 : { *(.comment) }
+
+	/*
+	 * DWARF debug sections.
+	 * Symbols in the DWARF debugging sections are relative to the
+	 * beginning of the section so we begin them at 0.
+ */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000000..de2fb930471a --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 000000000000..3f42f27f978c --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + .align 8 +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index a6e0fb6b91d6..2a2092ce19f1 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -74,3 +74,11 @@ vdso64.so: $(obj)/vdso64.so.dbg $(call cmd,vdso_install) vdso_install: vdso64.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = 
VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 000000000000..37f05cb38dad --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' -- cgit v1.2.3 From df29a7440c4b5c65765c8f60396b3b13063e24e9 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 25 Jun 2021 15:02:08 +0200 Subject: s390/signal: switch to using vdso for sigreturn and syscall restart With generic entry, there's a bug when it comes to restarting of interrupted system calls. The failing sequence is: a) a signal comes in, and no handler is registered, so the lower part of arch_do_signal_or_restart() in arch/s390/kernel/signal.c sets PIF_SYSCALL_RESTART. b) a second signal becomes pending while the kernel is still in the exit loop, and for that one, a handler exists. c) The first part of arch_do_signal_or_restart() is called. That part calls handle_signal(), which sets up stack + registers for handling the signal. d) __do_syscall() in arch/s390/kernel/syscall.c checks for PIF_SYSCALL_RESTART right before leaving to userspace. If it is set, it restarts the syscall. However, the registers are already set up for handling the signal from c). The syscall is now restarted with the wrong arguments. Change the code to: - use the vdso for restart_syscall() instead of PIF_SYSCALL_RESTART, because on s390 we cannot simply rewind and go back to userspace: the system call number might be encoded in the svc instruction. - for all other syscalls, rewind the PSW and return to userspace.
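In condensed form, the new flow described above looks like this (distilled from the signal.c and syscall.c hunks that follow; compat handling, PER single-step cleanup, and error paths are omitted, so this is an illustrative sketch rather than a drop-in patch):

	/* arch_do_signal_or_restart(): no signal handler registered */
	case -ERESTART_RESTARTBLOCK:
		/* restart via the vdso trampoline; remember where the
		 * interrupted svc would have returned to */
		regs->gprs[2] = regs->orig_gpr2;
		current->restart_block.arch_data = regs->psw.addr;
		regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
		break;
	case -ERESTARTNOHAND:
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
		/* rewind the PSW so the svc instruction is executed again */
		regs->gprs[2] = regs->orig_gpr2;
		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
		break;

	/* do_syscall(): the vdso trampoline re-enters the kernel */
	if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
		regs->psw.addr = current->restart_block.arch_data;
		current->restart_block.arch_data = 1;	/* 1 == no address stored */
	}

The low bit of arch_data doubles as a "no valid address stored" marker, which is why copy_thread() below initializes it to 1.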
Cc: # v5.12+ d57778feb987: s390/vdso: always enable vdso Cc: # v5.12+ 686341f2548b: s390/vdso64: add sigreturn,rt_sigreturn and restart_syscall Cc: # v5.12+ 43e1f76b0b69: s390/vdso: rename VDSO64_LBASE to VDSO_LBASE Cc: # v5.12+ 779df2248739: s390/vdso: add minimal compat vdso Cc: # v5.12+ Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/compat_signal.c | 13 +++---------- arch/s390/kernel/process.c | 6 ++++++ arch/s390/kernel/signal.c | 39 ++++++++++++++++++--------------------- arch/s390/kernel/syscall.c | 4 ++++ 4 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 1d0e17ec93eb..cca142fbb516 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -118,7 +119,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -304,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - /* Signal frames without vectors registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, sigreturn); } /* Set up registers for signal handler */ @@ -371,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, rt_sigreturn); } /* Create siginfo on the signal stack */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 7ae5dde9c54d..350e94d0cac2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -166,6 +166,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int)tls; } } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; return 0; } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 080e7aed181f..78ef53b29958 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "entry.h" /* @@ -171,7 +172,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -334,15 +334,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* Set up to return from userspace. If provided, use a stub already in userspace. 
*/ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; - } else { - /* Signal frame without vector registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -397,14 +392,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { + if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; - } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -501,7 +492,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -517,14 +508,20 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ - regs->int_code = __NR_restart_syscall; - fallthrough; + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - /* Restart system call with magic TIF bit. */ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 76f7916cc30f..c6b99da0738b 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -121,6 +121,10 @@ void do_syscall(struct pt_regs *regs) regs->gprs[2] = nr; + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } nr = syscall_enter_from_user_mode_work(regs, nr); /* -- cgit v1.2.3 From fbf50f47ea99d07aec59859027352d4837e84ce1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 25 Jun 2021 15:06:06 +0200 Subject: s390/signal: remove sigreturn on stack {rt_}sigreturn is now called from the vdso, so we no longer need the svc on the stack, and therefore no longer need the hack that supported this mechanism on machines with a non-executable stack.
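The hack in question is visible in the traps.c hunk below: __do_pgm_check() used to detour back into the syscall path when userspace faulted while executing the svc that had been written onto a non-executable signal stack. In condensed form (illustrative only):

	/* __do_pgm_check(), before this patch */
	syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL);
	/* ... regular program check handling ... */
	if (syscall_redirect) {
		enter_from_user_mode(regs);
		local_irq_enable();
		regs->orig_gpr2 = regs->gprs[2];
		do_syscall(regs);
		exit_to_user_mode();
	}

With the restorer now living in the vdso, a fault on the signal stack can never be a disguised system call, so the detour goes away and do_syscall() can become static.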
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/syscall.c | 2 +- arch/s390/kernel/traps.c | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index c6b99da0738b..ec73d2c61e58 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -void do_syscall(struct pt_regs *regs) +static void do_syscall(struct pt_regs *regs) { unsigned long nr; diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 3a6d08d6df6f..76947275fe8b 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -301,10 +301,9 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { unsigned long last_break = S390_lowcore.breaking_event_addr; - unsigned int trapnr, syscall_redirect = 0; + unsigned int trapnr; irqentry_state_t state; - add_random_kstack_offset(); regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; regs->int_parm_long = S390_lowcore.trans_exc_code; @@ -346,18 +345,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); - syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL); out: local_irq_disable(); irqentry_exit(regs, state); - - if (syscall_redirect) { - enter_from_user_mode(regs); - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - do_syscall(regs); - exit_to_user_mode(); - } } /* -- cgit v1.2.3 From e3c7a8d7f44f4b36eb299563526ef8c5cb8011b0 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 30 Jun 2021 13:50:55 +0200 Subject: s390: move restart of execve() syscall On s390, execve might have to be restarted for PGSTE binaries like kvm. In the past this was done via the PIF_SYSCALL_RESTART bit. However, with the recent changes, syscalls are now restarted differently. Now that execve() is the only call that might get restarted via PIF_SYSCALL_RESTART, move the loop to do_syscall(). This also has the advantage that the restart is no longer visible to userspace. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/syscall.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index ec73d2c61e58..0322f00f84ac 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -134,13 +134,15 @@ static void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. 
*/ - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) { - regs->gprs[2] = -ENOSYS; - if (likely(nr < NR_syscalls)) - regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } else { - clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); - } + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART)); +out: syscall_exit_to_user_mode_work(regs); } @@ -158,13 +160,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) if (per_trap) set_thread_flag(TIF_PER_TRAP); - for (;;) { - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART)) - break; - local_irq_enable(); - } + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); exit_to_user_mode(); } -- cgit v1.2.3 From d26a357fe88e3875bcdf4a167d4182228c7e8964 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 30 Jun 2021 14:02:41 +0200 Subject: s390: rename PIF_SYSCALL_RESTART to PIF_EXECVE_PGSTE_RESTART PIF_SYSCALL_RESTART is now only used to restart execve when loading PGSTE binaries. Rename the flag to reflect that, and avoid people thinking that this bit has anything to do with generic syscall restarting. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 0322f00f84ac..8fe2d23b64f4 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -134,6 +134,7 @@ static void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) goto out; regs->gprs[2] = -ENOSYS; @@ -141,7 +142,7 @@ static void do_syscall(struct pt_regs *regs) goto out; do { regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } while (test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART)); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); out: syscall_exit_to_user_mode_work(regs); } -- cgit v1.2.3 From 938e02beb3a0181ed1c7828e8939ffa32c350bea Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 22:26:26 +0200 Subject: s390/irq: simplify do_softirq_own_stack() do_softirq_own_stack() is always called from task context and therefore it is not necessary to check if the async stack is currently used. Remove the check and directly switch to async stack. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index c0df4060d28d..a03d9f54d36f 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -270,17 +270,7 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) */ void do_softirq_own_stack(void) { - unsigned long old, new; - - old = current_stack_pointer(); - /* Check against async. stack address range. 
*/ - new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { - CALL_ON_STACK(__do_softirq, new, 0); - } else { - /* We are already on the async stack. */ - __do_softirq(); - } + CALL_ON_STACK(__do_softirq, S390_lowcore.async_stack, 0); } /* -- cgit v1.2.3 From 2ae6521504941650fd48bcefd288730c3e44211a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 22:27:06 +0200 Subject: s390/irq: inline do_softirq_own_stack() Move do_softirq_own_stack() to the proper header file so it can be inlined, saving a few cycles. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index a03d9f54d36f..b55af916fbd5 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -265,14 +265,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) return from < NR_IRQS_BASE ? NR_IRQS_BASE : from; } -/* - * Switch to the asynchronous interrupt stack for softirq execution. - */ -void do_softirq_own_stack(void) -{ - CALL_ON_STACK(__do_softirq, S390_lowcore.async_stack, 0); -} - /* * ext_int_hash[index] is the list head for all external interrupts that hash * to this index. -- cgit v1.2.3 From bb250e64e4702774ddee052b57136ab222f59ce1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 20:57:49 +0200 Subject: s390/irq: simplify on_async_stack() Make on_async_stack() a bit more readable, even though, as usual, it depends on whether one considers "!!!" readable or not. At least the new construct to check whether the async stack is in use is a bit shorter and generates slightly better code. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index b55af916fbd5..a1af5ecdabdf 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -110,7 +110,7 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)); + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) -- cgit v1.2.3 From de556892dc96e33bba7cda6a398f4d367cead50e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 20:38:14 +0200 Subject: s390/irq: use call_on_stack() macro Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index a1af5ecdabdf..234d085257eb 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -115,10 +115,12 @@ static int on_async_stack(void) static void do_irq_async(struct pt_regs *regs, int irq) { - if (on_async_stack()) + if (on_async_stack()) { do_IRQ(regs, irq); - else - CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } } static int irq_pending(struct pt_regs *regs) -- cgit v1.2.3 From 845370f47fa0833d1b39be189c8bfea29a78ecc0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 20:38:26 +0200 Subject: s390/kexec:
use call_on_stack() macro Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/machine_kexec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index d91989c7bd6a..1005a6935fbe 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -132,7 +132,8 @@ static bool kdump_csum_valid(struct kimage *image) int rc; preempt_disable(); - rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image); + rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump, + unsigned long, (unsigned long)image); preempt_enable(); return rc == 0; #else -- cgit v1.2.3 From 0f541cc20129b8529c33f8aa42734f8bdd006582 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 20:38:36 +0200 Subject: s390/smp: use call_on_stack() macro Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/smp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index ff42d3aa0f00..8db0f9fcbbdd 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -300,24 +300,28 @@ static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); } +typedef void (pcpu_delegate_fn)(void *); + /* * Call function via PSW restart on pcpu and stop the current cpu. */ -static void __pcpu_delegate(void (*func)(void*), void *data) +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) { func(data); /* should not return */ } static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - void (*func)(void *), + pcpu_delegate_fn *func, void *data, unsigned long stack) { struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; unsigned long source_cpu = stap(); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - if (pcpu->address == source_cpu) - CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data); + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); /* Restart func on the target cpu and stop the current cpu. */ -- cgit v1.2.3 From b55e692e6bcbec36b4e0ba683608e7e1e7aab8c7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Jul 2021 17:55:32 +0200 Subject: s390: rename CALL_ON_STACK_NORETURN() to call_on_stack_noreturn() Lower case matches the call_on_stack() macro and is easier to read. 
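Side by side, the conversion across these patches looks like this (condensed from the machine_kexec.c hunk above and the setup.c hunk below; the macro definitions themselves live outside arch/s390/kernel and are not shown in this log):

	/* old: argument count followed by untyped varargs */
	rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);

	/* new: every argument carries an explicit type */
	rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long,
			   do_start_kdump, unsigned long, (unsigned long)image);

	/* noreturn variant, now lower case to match */
	call_on_stack_noreturn(rest_init, stack);

Spelling out each argument's type makes the call sites more verbose, but lets the macro pass arguments without relying on implicit varargs conversions.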
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 +- arch/s390/kernel/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b58ee83f30e3..d55c636e4889 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -354,7 +354,7 @@ void __init arch_call_rest_init(void) set_task_stack_end_magic(current); stack += STACK_INIT_OFFSET; S390_lowcore.kernel_stack = stack; - CALL_ON_STACK_NORETURN(rest_init, stack); + call_on_stack_noreturn(rest_init, stack); } static void __init setup_lowcore_dat_off(void) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 8db0f9fcbbdd..a0a8c0abf7de 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -902,7 +902,7 @@ static void __no_sanitize_address smp_start_secondary(void *cpuvoid) S390_lowcore.restart_source = -1UL; __ctl_load(S390_lowcore.cregs_save_area, 0, 15); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack); + call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack); } /* Upping and downing of CPUs */ -- cgit v1.2.3 From 6a942f5780545ebd11aca8b3ac4b163397962322 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 7 Jul 2021 17:33:38 +0100 Subject: s390: preempt: Fix preempt_count initialization S390's init_idle_preempt_count(p, cpu) doesn't actually let us initialize the preempt_count of the requested CPU's idle task: it unconditionally writes to the current CPU's. This clearly conflicts with idle_threads_init(), which intends to initialize *all* the idle tasks, including their preempt_count (or their CPU's, if the arch uses a per-CPU preempt_count). Unfortunately, it seems the way s390 does things doesn't let us initialize every possible CPU's preempt_count early on, as the pages where this resides are only allocated when a CPU is brought up and are freed when it is brought down. Let the arch-specific code set a CPU's preempt_count when its lowcore is allocated, and turn init_idle_preempt_count() into an empty stub. 
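In sketch form, the resulting arrangement is (the lowcore assignment matches the hunks below; the empty stub lives in arch/s390/include/asm/preempt.h, which is outside this "limited to arch/s390/kernel" view, so its exact shape here is an assumption):

	/* when a CPU's lowcore is set up or allocated (see hunks below): */
	lc->preempt_count = PREEMPT_DISABLED;

	/* assumed shape of the now-empty generic hook (header not shown): */
	static inline void init_idle_preempt_count(struct task_struct *p, int cpu) { }

With the value baked into the lowcore at allocation time, there is nothing left for the generic init_idle_preempt_count() hook to do on s390.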
Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption disabled") Reported-by: Guenter Roeck Signed-off-by: Valentin Schneider Tested-by: Guenter Roeck Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20210707163338.1623014-1-valentin.schneider@arm.com Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 1 + arch/s390/kernel/smp.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d55c636e4889..9e1f538e58e6 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -442,6 +442,7 @@ static void __init setup_lowcore_dat_off(void) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; set_prefix((u32)(unsigned long) lc); lowcore_ptr[0] = lc; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a0a8c0abf7de..8984711f72ed 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -210,6 +210,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; if (nmi_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; -- cgit v1.2.3