/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
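/*
 * For orientation, a rough C-level model of the flow implemented
 * below. This is an illustrative sketch only: it leaves out the
 * user-space access annotations, the 64-byte unrolling, the
 * cached-vs-uncached store distinction in the head and tail, and
 * the fault handling that can make the return value non-zero.
 *
 *	unsigned long model(unsigned char *dst, const unsigned char *src,
 *			    unsigned int count)
 *	{
 *		while (count && ((unsigned long)dst & 7)) {
 *			*dst++ = *src++;	// align the destination
 *			count--;
 *		}
 *		while (count >= 8) {		// movq load + movnti store
 *			*(unsigned long *)dst = *(const unsigned long *)src;
 *			dst += 8; src += 8; count -= 8;
 *		}
 *		while (count) {			// 1-7 byte tail
 *			*dst++ = *src++;
 *			count--;
 *		}
 *		return count;			// bytes left uncopied
 *	}
 */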
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

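	/*
	 * Main loop: copy 64 bytes per iteration, as two groups of
	 * four quadword loads from user space, each followed by four
	 * non-temporal quadword stores to the aligned destination.
	 */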
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the
 * destination, so we need to take that into
 * account before falling back to the
 * non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

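/*
 * Copy any remaining full quadwords one at a time, still using
 * aligned non-temporal stores.
 */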
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
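	/*
	 * No more non-temporal stores after this point: fence the
	 * ones already issued before any remaining tail bytes are
	 * written with ordinary cached stores.
	 */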
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
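/*
 * For example, a destination address ending in binary ...101 gets
 * one cached byte copy and then one cached word copy, after which
 * it is 8-byte aligned; one ending in ...100 needs only the single
 * 4-byte uncached copy.
 */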
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 */
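/*
 * Each .LdoneNN label falls through into the one below it,
 * subtracting another 8 from %edx at each step, so .LdoneNN
 * subtracts NN in total: a store fault at destination offset NN
 * means NN bytes of the current 64-byte block had already been
 * written and must not be counted as uncopied in %eax.
 */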
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

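/*
 * A user-space load in the second half of the unrolled block
 * faulted after the first 32 bytes of that block had already been
 * stored: account for those and retry the rest through the
 * non-unrolled quadword loop.
 */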
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

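/*
 * The quadword load (50b above) faulted: as explained next to its
 * exception table entry, make one final 4-byte uncached attempt
 * before giving up.
 */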
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)