author    | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-20 15:13:50 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-20 18:53:49 -0700
commit    | 034ff37d34071ff3f48755f728cd229e42a4f15d
tree      | 8050dff8bb1da92ccf36b36c5e9bb84027d3bfa3
parent    | e1f2750edc4afebb966a229b797fc89b98ee6098
x86: rewrite '__copy_user_nocache' function (branch: x86-rep-insns)
I didn't really want to do this, but as part of all the other changes to
the user copy loops, I've been looking at this horror.
I tried to clean it up multiple times, but every time I just found more
problems, and the way it's written, it's just too hard to fix them.
For example, the code is written to do quad-word alignment, and will use
regular byte accesses to get to that point. That's fairly simple, but
it means that any initial 8-byte alignment will be done with cached
copies.
However, the code then is very careful to do any 4-byte _tail_ accesses
using an uncached 4-byte write, and that was claimed to be relevant in
commit a82eee742452 ("x86/uaccess/64: Handle the caching of 4-byte
nocache copies properly in __copy_user_nocache()").
So if you do a 4-byte copy using that function, it carefully uses a
4-byte 'movnti' for the destination. But if you were to do a 12-byte
copy that is 4-byte aligned, it would _not_ do a 4-byte 'movnti'
followed by an 8-byte 'movnti' to keep it all uncached.
Instead, it would align the destination to 8 bytes using a
byte-at-a-time loop, and then do an 8-byte 'movnti' for the final 8
bytes.
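
To make the 12-byte case concrete, here is a small C model of the store sizes the old routine ends up picking, derived from the ALIGN_DESTINATION macro and the tail handling in the assembly deleted below. It is only a sketch with a made-up helper name (old_nocache_plan), not kernel code:

```c
/*
 * Sketch: which store sizes the old __copy_user_nocache() uses,
 * as a function of destination alignment and length.
 */
#include <stdio.h>

static void old_nocache_plan(unsigned long dst, unsigned long len)
{
	printf("copy of %lu bytes to %#lx:\n", len, dst);
	if (len >= 8) {
		/* ALIGN_DESTINATION: cached byte stores up to 8-byte alignment */
		for (; (dst & 7) && len; dst++, len--)
			printf("  cached 1-byte store at %#lx\n", dst);
		/* the 4x8-byte and 8-byte loops: uncached 8-byte movnti */
		for (; len >= 8; dst += 8, len -= 8)
			printf("  movnti 8-byte store at %#lx\n", dst);
	}
	/* a 4-byte tail is uncached only if the destination is 4-byte aligned */
	if (len >= 4 && !(dst & 3)) {
		printf("  movnti 4-byte store at %#lx\n", dst);
		dst += 4;
		len -= 4;
	}
	/* everything else falls back to cached byte stores */
	for (; len; dst++, len--)
		printf("  cached 1-byte store at %#lx\n", dst);
}

int main(void)
{
	old_nocache_plan(0x1004, 4);	/* one uncached 4-byte movnti */
	old_nocache_plan(0x1004, 12);	/* 4 cached bytes + one 8-byte movnti */
	return 0;
}
```

For the 4-byte-aligned 12-byte copy the model prints four cached byte stores followed by a single 8-byte movnti: exactly the mixed behavior described above.
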
The main caller that cares is __copy_user_flushcache(), which knows
about this insanity, and has odd cases for it all. But I just can't
deal with looking at this kind of "it does one case right, and another
related case entirely wrong".
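
The reason a flushcache-style caller has to care: any byte the nocache routine wrote with a regular cached store still has to be flushed out of the cache explicitly before the data can be considered durable on persistent memory. A rough userspace-flavored sketch of that obligation, using the clflushopt/sfence intrinsics and a made-up helper name, is below; it is not the kernel's __copy_user_flushcache(), just an illustration of the extra work the cached head/tail forces on the caller:

```c
#include <immintrin.h>	/* _mm_clflushopt(), _mm_sfence(); build with -mclflushopt */
#include <stddef.h>
#include <stdint.h>

#define CACHELINE	64UL

/*
 * Sketch: flush every cache line overlapping [addr, addr + len),
 * the kind of cleanup a flushcache caller must do for any range
 * that went through cached stores instead of movnti.
 */
static void flush_dest_range(const void *addr, size_t len)
{
	uintptr_t p = (uintptr_t)addr & ~(CACHELINE - 1);
	uintptr_t end = (uintptr_t)addr + len;

	for (; p < end; p += CACHELINE)
		_mm_clflushopt((void *)p);
	_mm_sfence();	/* order the flushes with later stores */
}

int main(void)
{
	char buf[256];
	/* e.g. the unaligned head of a copy was written with cached stores */
	flush_dest_range(buf + 3, 5);
	return 0;
}
```
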
And the code really wasn't fixable without hard drugs, which I try to
avoid.
So instead, rewrite it in a form that hopefully not only gets this
right, but is a bit more maintainable. Knock wood.
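
The overall shape of the replacement (the new copy_user_uncached_64.S in the diff below) is easier to see in C than in assembly. This is only a control-flow sketch with a made-up helper name, with all of the fault handling left out:

```c
#include <stdio.h>

static void store(const char *how, unsigned long dst, int size)
{
	printf("  %-6s %d-byte store at %#lx\n", how, size, dst);
}

/* Sketch of the new __copy_user_nocache() structure. */
static void new_nocache_plan(unsigned long dst, unsigned long len)
{
	/* head: cached 1/2-byte stores, uncached 4-byte store, up to 8-byte alignment */
	if ((dst & 1) && len)      { store("cached", dst, 1); dst += 1; len -= 1; }
	if ((dst & 2) && len >= 2) { store("cached", dst, 2); dst += 2; len -= 2; }
	if ((dst & 4) && len >= 4) { store("movnti", dst, 4); dst += 4; len -= 4; }

	/* bulk: 64-byte unrolled movnti loop, then an 8-byte movnti loop */
	for (; len >= 64; dst += 64, len -= 64)
		store("movnti", dst, 64);	/* eight 8-byte movnti in the real code */
	for (; len >= 8; dst += 8, len -= 8)
		store("movnti", dst, 8);

	/* tail: one more uncached 4-byte store, fence, then cached 2/1-byte stores */
	if (len & 4) { store("movnti", dst, 4); dst += 4; len -= 4; }
	printf("  sfence\n");	/* fence the non-temporal stores */
	if (len & 2) { store("cached", dst, 2); dst += 2; len -= 2; }
	if (len & 1) { store("cached", dst, 1); }
}

int main(void)
{
	new_nocache_plan(0x1004, 12);	/* now a 4-byte movnti followed by an 8-byte movnti */
	return 0;
}
```
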
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | arch/x86/lib/Makefile                | 2
-rw-r--r-- | arch/x86/lib/copy_user_64.S          | 213
-rw-r--r-- | arch/x86/lib/copy_user_uncached_64.S | 242
3 files changed, 243 insertions(+), 214 deletions(-)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4f1a40a86534..01932af64193 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
 endif
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o
+        lib-y += copy_user_64.o copy_user_uncached_64.o
         lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index d424fb75e0f0..4fc5c2de2de4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -7,15 +7,8 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative.h>
 #include <asm/asm.h>
-#include <asm/smap.h>
 #include <asm/export.h>
-#include <asm/trapnr.h>
 
 /*
  * rep_movs_alternative - memory copy with exception handling.
@@ -119,209 +112,3 @@ SYM_FUNC_START(rep_movs_alternative)
 	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
 SYM_FUNC_END(rep_movs_alternative)
 EXPORT_SYMBOL(rep_movs_alternative)
-
-/*
- * The uncached copy needs to align the destination for
- * movnti and friends.
- */
-.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f		/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-
-	_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
-	_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm
-
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination out of cache for more performance.
- *
- * Note: Cached memory copy is used when destination or size is not
- * naturally aligned. That is:
- *  - Require 8-byte alignment when size is 8 bytes or larger.
- *  - Require 4-byte alignment when size is 4 bytes.
- */
-SYM_FUNC_START(__copy_user_nocache)
-	/* If size is less than 8 bytes, go to 4-byte copy */
-	cmpl $8,%edx
-	jb .L_4b_nocache_copy_entry
-
-	/* If destination is not 8-byte aligned, "cache" copy to align it */
-	ALIGN_DESTINATION
-
-	/* Set 4x8-byte copy count and remainder */
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 4x8-byte nocache loop-copy */
-.L_4x8b_nocache_copy_loop:
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movnti %r8,(%rdi)
-6:	movnti %r9,1*8(%rdi)
-7:	movnti %r10,2*8(%rdi)
-8:	movnti %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movnti %r8,4*8(%rdi)
-14:	movnti %r9,5*8(%rdi)
-15:	movnti %r10,6*8(%rdi)
-16:	movnti %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz .L_4x8b_nocache_copy_loop
-
-	/* Set 8-byte copy count and remainder */
-.L_8b_nocache_copy_entry:
-	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 8-byte nocache loop-copy */
-.L_8b_nocache_copy_loop:
-20:	movq (%rsi),%r8
-21:	movnti %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz .L_8b_nocache_copy_loop
-
-	/* If no byte left, we're done */
-.L_4b_nocache_copy_entry:
-	andl %edx,%edx
-	jz .L_finish_copy
-
-	/* If destination is not 4-byte aligned, go to byte copy: */
-	movl %edi,%ecx
-	andl $3,%ecx
-	jnz .L_1b_cache_copy_entry
-
-	/* Set 4-byte copy count (1 or 0) and remainder */
-	movl %edx,%ecx
-	andl $3,%edx
-	shrl $2,%ecx
-	jz .L_1b_cache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 4-byte nocache copy: */
-30:	movl (%rsi),%r8d
-31:	movnti %r8d,(%rdi)
-	leaq 4(%rsi),%rsi
-	leaq 4(%rdi),%rdi
-
-	/* If no bytes left, we're done: */
-	andl %edx,%edx
-	jz .L_finish_copy
-
-	/* Perform byte "cache" loop-copy for the remainder */
-.L_1b_cache_copy_entry:
-	movl %edx,%ecx
-.L_1b_cache_copy_loop:
-40:	movb (%rsi),%al
-41:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .L_1b_cache_copy_loop
-
-	/* Finished copying; fence the prior stores */
-.L_finish_copy:
-	xorl %eax,%eax
-	sfence
-	RET
-
-.L_fixup_4x8b_copy:
-	shll $6,%ecx
-	addl %ecx,%edx
-	jmp .L_fixup_handle_tail
-.L_fixup_8b_copy:
-	lea (%rdx,%rcx,8),%rdx
-	jmp .L_fixup_handle_tail
-.L_fixup_4b_copy:
-	lea (%rdx,%rcx,4),%rdx
-	jmp .L_fixup_handle_tail
-.L_fixup_1b_copy:
-	movl %ecx,%edx
-.L_fixup_handle_tail:
-	sfence
-	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
-	_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
-	_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
-	_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
-	_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
-	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
-
-/*
- * Try to copy last bytes.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-.Lcopy_user_handle_tail:
-	cmp $X86_TRAP_MC,%eax
-	je 3f
-
-	movl %edx,%ecx
-1:	rep movsb
-2:	mov %ecx,%eax
-	RET
-
-3:
-	movl %edx,%eax
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 2b)
-
-.Lcopy_user_handle_align:
-	addl %ecx,%edx
-	jmp .Lcopy_user_handle_tail
-
-SYM_FUNC_END(__copy_user_nocache)
-EXPORT_SYMBOL(__copy_user_nocache)
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
new file mode 100644
index 000000000000..5c5f38d32672
--- /dev/null
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ *
+ * This copies from user space into kernel space, but the kernel
+ * space accesses can take a machine check exception, so they too
+ * need exception handling.
+ *
+ * Note: only 32-bit and 64-bit stores have non-temporal versions,
+ * and we only use aligned versions. Any unaligned parts at the
+ * start or end of the copy will be done using normal cached stores.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * edx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__copy_user_nocache)
+	/* If destination is not 7-byte aligned, we'll have to align it */
+	testb $7,%dil
+	jne .Lalign
+
+.Lis_aligned:
+	cmp $64,%edx
+	jb .Lquadwords
+
+	.p2align 4,0x90
+.Lunrolled:
+10:	movq (%rsi),%r8
+11:	movq 8(%rsi),%r9
+12:	movq 16(%rsi),%r10
+13:	movq 24(%rsi),%r11
+20:	movnti %r8,(%rdi)
+21:	movnti %r9,8(%rdi)
+22:	movnti %r10,16(%rdi)
+23:	movnti %r11,24(%rdi)
+30:	movq 32(%rsi),%r8
+31:	movq 40(%rsi),%r9
+32:	movq 48(%rsi),%r10
+33:	movq 56(%rsi),%r11
+40:	movnti %r8,32(%rdi)
+41:	movnti %r9,40(%rdi)
+42:	movnti %r10,48(%rdi)
+43:	movnti %r11,56(%rdi)
+
+	addq $64,%rsi
+	addq $64,%rdi
+	sub $64,%edx
+	cmp $64,%edx
+	jae .Lunrolled
+
+/*
+ * First set of user mode loads have been done
+ * without any stores, so if they fail, we can
+ * just try the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(10b, .Lquadwords)
+_ASM_EXTABLE_UA(11b, .Lquadwords)
+_ASM_EXTABLE_UA(12b, .Lquadwords)
+_ASM_EXTABLE_UA(13b, .Lquadwords)
+
+/*
+ * The second set of user mode loads have been
+ * done with 32 bytes stored to the destination,
+ * so we need to take that into account before
+ * falling back to the unrolled loop.
+ */
+_ASM_EXTABLE_UA(30b, .Lfixup32)
+_ASM_EXTABLE_UA(31b, .Lfixup32)
+_ASM_EXTABLE_UA(32b, .Lfixup32)
+_ASM_EXTABLE_UA(33b, .Lfixup32)
+
+/*
+ * An exception on a write means that we're
+ * done, but we need to update the count
+ * depending on where in the unrolled loop
+ * we were.
+ */
+_ASM_EXTABLE_UA(20b, .Ldone0)
+_ASM_EXTABLE_UA(21b, .Ldone8)
+_ASM_EXTABLE_UA(22b, .Ldone16)
+_ASM_EXTABLE_UA(23b, .Ldone24)
+_ASM_EXTABLE_UA(40b, .Ldone32)
+_ASM_EXTABLE_UA(41b, .Ldone40)
+_ASM_EXTABLE_UA(42b, .Ldone48)
+_ASM_EXTABLE_UA(43b, .Ldone56)
+
+.Lquadwords:
+	cmp $8,%edx
+	jb .Llong
+50:	movq (%rsi),%rax
+51:	movnti %rax,(%rdi)
+	addq $8,%rsi
+	addq $8,%rdi
+	sub $8,%edx
+	jmp .Lquadwords
+
+/*
+ * If we fail on the last full quadword, we will
+ * not try to do any byte-wise cached accesses.
+ * We will try to do one more 4-byte uncached
+ * one, though.
+ */
+_ASM_EXTABLE_UA(50b, .Llast4)
+_ASM_EXTABLE_UA(51b, .Ldone0)
+
+.Llong:
+	test $4,%dl
+	je .Lword
+60:	movl (%rsi),%eax
+61:	movnti %eax,(%rdi)
+	addq $4,%rsi
+	addq $4,%rdi
+	sub $4,%edx
+.Lword:
+	sfence
+	test $2,%dl
+	je .Lbyte
+70:	movw (%rsi),%ax
+71:	movw %ax,(%rdi)
+	addq $2,%rsi
+	addq $2,%rdi
+	sub $2,%edx
+.Lbyte:
+	test $1,%dl
+	je .Ldone
+80:	movb (%rsi),%al
+81:	movb %al,(%rdi)
+	dec %edx
+.Ldone:
+	mov %edx,%eax
+	RET
+
+/*
+ * If we fail on the last four bytes, we won't
+ * bother with any fixups. It's dead, Jim. Note
+ * that there's no need for 'sfence' for any
+ * of this, since the exception will have been
+ * serializing.
+ */
+_ASM_EXTABLE_UA(60b, .Ldone)
+_ASM_EXTABLE_UA(61b, .Ldone)
+_ASM_EXTABLE_UA(70b, .Ldone)
+_ASM_EXTABLE_UA(71b, .Ldone)
+_ASM_EXTABLE_UA(80b, .Ldone)
+_ASM_EXTABLE_UA(81b, .Ldone)
+
+/*
+ * This is the "head needs aliging" case when
+ * the destination isn't 8-byte aligned. The
+ * 4-byte case can be done uncached, but any
+ * smaller alignment is done with regular stores.
+ */
+.Lalign:
+	test $1,%dil
+	je .Lalign_word
+	test %edx,%edx
+	je .Ldone
+90:	movb (%rsi),%al
+91:	movb %al,(%rdi)
+	inc %rsi
+	inc %rdi
+	dec %edx
+.Lalign_word:
+	test $2,%dil
+	je .Lalign_long
+	cmp $2,%edx
+	jb .Lbyte
+92:	movw (%rsi),%ax
+93:	movw %ax,(%rdi)
+	addq $2,%rsi
+	addq $2,%rdi
+	sub $2,%edx
+.Lalign_long:
+	test $4,%dil
+	je .Lis_aligned
+	cmp $4,%edx
+	jb .Lword
+94:	movl (%rsi),%eax
+95:	movnti %eax,(%rdi)
+	addq $4,%rsi
+	addq $4,%rdi
+	sub $4,%edx
+	jmp .Lis_aligned
+
+/*
+ * If we fail on the initial alignment accesses,
+ * we're all done. Again, no point in trying to
+ * do byte-by-byte probing if the 4-byte load
+ * fails - we're not doing any uncached accesses
+ * any more.
+ */
+_ASM_EXTABLE_UA(90b, .Ldone)
+_ASM_EXTABLE_UA(91b, .Ldone)
+_ASM_EXTABLE_UA(92b, .Ldone)
+_ASM_EXTABLE_UA(93b, .Ldone)
+_ASM_EXTABLE_UA(94b, .Ldone)
+_ASM_EXTABLE_UA(95b, .Ldone)
+
+/*
+ * Exception table fixups for faults in the middle
+ */
+.Ldone56: sub $8,%edx
+.Ldone48: sub $8,%edx
+.Ldone40: sub $8,%edx
+.Ldone32: sub $8,%edx
+.Ldone24: sub $8,%edx
+.Ldone16: sub $8,%edx
+.Ldone8: sub $8,%edx
+.Ldone0:
+	mov %edx,%eax
+	RET
+
+.Lfixup32:
+	addq $32,%rsi
+	addq $32,%rdi
+	sub $32,%edx
+	jmp .Lquadwords
+
+.Llast4:
+52:	movl (%rsi),%eax
+53:	movnti %eax,(%rdi)
+	sfence
+	sub $4,%edx
+	mov %edx,%eax
+	RET
+_ASM_EXTABLE_UA(52b, .Ldone0)
+_ASM_EXTABLE_UA(53b, .Ldone0)
+
+SYM_FUNC_END(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
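
One detail of the new file worth calling out is the fixup chain at the end: a fault on the movnti at offset 8*N of the unrolled block is routed to the corresponding .Ldone label, and the fall-through chain of 'sub $8,%edx' instructions credits one completed 8-byte store per level before returning the uncopied count. The same accounting, written as deliberate C switch fall-through purely for illustration (the function name is made up):

```c
/*
 * C analogue of the .Ldone0 ... .Ldone56 fall-through chain:
 * 'n' is which 8-byte movnti of the 64-byte unrolled block faulted
 * (0..7), 'remaining' is the byte count at loop entry.  Each level
 * subtracts the 8 bytes of one store that did complete.
 */
static unsigned int uncopied_after_fault(int n, unsigned int remaining)
{
	switch (n) {
	case 7: remaining -= 8;	/* fall through */
	case 6: remaining -= 8;	/* fall through */
	case 5: remaining -= 8;	/* fall through */
	case 4: remaining -= 8;	/* fall through */
	case 3: remaining -= 8;	/* fall through */
	case 2: remaining -= 8;	/* fall through */
	case 1: remaining -= 8;	/* fall through */
	case 0: break;		/* fault on the very first store: nothing written */
	}
	return remaining;	/* reported to the caller as uncopied bytes */
}
```

For example, uncopied_after_fault(4, 100) returns 68: the four stores before the faulting one landed, so 32 of the 100 bytes no longer count as uncopied.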