/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
#include <linux/linkage.h>
#include <linux/objtool.h>
// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
.set OFFSETOF_SHUF_TABLE, 1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
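//
// Note: CONSTS_PTR (the 'consts' function argument) points at the
// fold_across_128_bits_consts field of the generated constants struct, which
// is why that offset is 0 and the earlier fields have negative offsets.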
// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves. The supported
// instruction formats are:
//
// - Two-arg [src, dst], where the non-VEX format is the same.
// - Three-arg [src1, src2, dst] where the non-VEX format is
// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
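//
// For example (illustrative only), with AVX_LEVEL == 0 the invocation
//
//	_cond_vex pxor, %xmm1, %xmm2, %xmm3
//
// falls into the three-arg src2 != dst case and emits:
//
//	movdqa %xmm2, %xmm3
//	pxor %xmm1, %xmm3
//
// whereas with AVX_LEVEL != 0 it emits the VEX-coded form directly:
//
//	vpxor %xmm1, %xmm2, %xmm3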
.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
// VEX not allowed. Emulate it.
.ifnb \arg3 // Three-arg [src1, src2, dst]
.ifc "\arg2", "\arg3" // src2 == dst?
.ifnb \unaligned_mem_tmp
movdqu \arg1, \unaligned_mem_tmp
\insn \unaligned_mem_tmp, \arg3
.else
\insn \arg1, \arg3
.endif
.else // src2 != dst
.ifc "\arg1", "\arg3"
.error "Can't have src1 == dst when src2 != dst"
.endif
.ifnb \unaligned_mem_tmp
movdqu \arg1, \unaligned_mem_tmp
movdqa \arg2, \arg3
\insn \unaligned_mem_tmp, \arg3
.else
movdqa \arg2, \arg3
\insn \arg1, \arg3
.endif
.endif
.else // Two-arg [src, dst]
.ifnb \unaligned_mem_tmp
movdqu \arg1, \unaligned_mem_tmp
\insn \unaligned_mem_tmp, \arg2
.else
\insn \arg1, \arg2
.endif
.endif
.else
// VEX is allowed. Emit the desired instruction directly.
.ifnb \arg3
v\insn \arg1, \arg2, \arg3
.else
v\insn \arg1, \arg2
.endif
.endif
.endm
// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro _vbroadcast src, dst
.if VL == 16
_cond_vex movdqa, \src, \dst
.elseif VL == 32
vbroadcasti128 \src, \dst
.else
vbroadcasti32x4 \src, \dst
.endif
.endm
// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro _load_data vl, src, bswap_mask, dst
.if \vl < 64
_cond_vex movdqu, "\src", \dst
.else
vmovdqu8 \src, \dst
.endif
.if !LSB_CRC
_cond_vex pshufb, \bswap_mask, \dst, \dst
.endif
.endm
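// XOR the first \vl bytes of data at BUF into \v0, which contains the initial
// CRC vector, using \v1 as a temporary register when one is needed. For
// msb-first CRCs the data is byte-reflected using \bswap_mask first.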
.macro _prepare_v0 vl, v0, v1, bswap_mask
.if LSB_CRC
.if \vl < 64
_cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
.else
vpxorq (BUF), \v0, \v0
.endif
.else
_load_data \vl, (BUF), \bswap_mask, \v1
.if \vl < 64
_cond_vex pxor, \v1, \v0, \v0
.else
vpxorq \v1, \v0, \v0
.endif
.endif
.endm
// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0
// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1
// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
\src1, \src2, \dst
.endm
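// For example (illustrative only), with LSB_CRC == 1 the invocation
//
//	_pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
//
// passes the immediate ((1 ^ 1) << 4) ^ (1 ^ 1) = 0x00 to pclmulqdq, i.e. it
// multiplies the physically low qwords, which hold the x^64..x^127 terms in
// lsb-first order. With LSB_CRC == 0 the same invocation passes 0x11 and
// multiplies the physically high qwords, which hold those terms in msb-first
// order.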
// Fold \acc into \data and store the result back into \acc. \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
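//
// Roughly, in the msb-first view (the lsb-first case differs only by the
// implicit extra factor of x per pclmulqdq, which the generated constants
// account for), the fold computes:
//
//	\acc := \acc_hi64 * \consts_hi64 + \acc_lo64 * \consts_lo64 + \data
//
// where \consts_hi64 and \consts_lo64 are 64-bit polynomials congruent to
// x^(D+64) and x^D mod G respectively, D being the fold distance in bits.
// The new \acc is therefore congruent to \acc * x^D + \data mod G, i.e. the
// accumulator has absorbed D more bits of the message.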
.macro _fold_vec acc, data, consts, tmp
_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
_pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
_cond_vex pxor, \data, \tmp, \tmp
_cond_vex pxor, \tmp, \acc, \acc
.else
vpternlogq $0x96, \data, \tmp, \acc
.endif
.endm
// Fold \acc into \data and store the result back into \acc. \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers. All arguments must have length \vl.
.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
_load_data \vl, \data, \bswap_mask, \tmp1
_fold_vec \acc, \tmp1, \consts, \tmp2
.else
_fold_vec \acc, \data, \consts, \tmp1
.endif
.endm
// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
.macro _load_vec_folding_consts i
_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
CONSTS
.endm
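// For example, _load_vec_folding_consts with VL == 32 (LOG2_VL == 5) and
// \i == 2 loads the constants for folding across 2**2 vectors = 128 bytes =
// 1024 bits; the computed offset is (4 - 5 - 2)*16 = -3*16, which is indeed
// OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS.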
// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0. Then, if the \vl bit of LEN is set (i.e. if \vl
// more data bytes from BUF still need to be folded at this stage), also fold
// \vl data bytes from BUF. For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
_fold_vec \v0, \v1, \consts, \tmp1
test $\vl, LEN8
jz .Lfold_vec_final_done\@
_fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
add $\vl, BUF
.Lfold_vec_final_done\@:
.endm
// This macro generates the body of a CRC function with the following prototype:
//
// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum. |len| is the data length in bytes, which must
// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
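// For example (illustrative only), expanding
//
//	_crc_pclmul n=32, lsb_crc=1, vl=16, avx_level=0
//
// generates the body of an SSE implementation of a 32-bit lsb-first CRC. See
// DEFINE_CRC_PCLMUL_FUNCS at the end of this file, which wraps such expansions
// in SYM_FUNC_START/SYM_FUNC_END to emit the actual functions.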
.macro _crc_pclmul n, lsb_crc, vl, avx_level
.set LSB_CRC, \lsb_crc
.set VL, \vl
.set AVX_LEVEL, \avx_level
// Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
.if VL == 16
.set V\i, %xmm\i
.set LOG2_VL, 4
.elseif VL == 32
.set V\i, %ymm\i
.set LOG2_VL, 5
.elseif VL == 64
.set V\i, %zmm\i
.set LOG2_VL, 6
.else
.error "Unsupported vector length"
.endif
.endr
// Define aliases for the function parameters.
// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
// guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
// when crc_t is shorter than u64.
#ifdef __x86_64__
.if \n <= 32
.set CRC, %edi
.else
.set CRC, %rdi
.endif
.set BUF, %rsi
.set LEN, %rdx
.set LEN32, %edx
.set LEN8, %dl
.set CONSTS_PTR, %rcx
#else
// 32-bit support, assuming -mregparm=3 and not including support for
// CRC-64 (which would use both eax and edx to pass the crc parameter).
.set CRC, %eax
.set BUF, %edx
.set LEN, %ecx
.set LEN32, %ecx
.set LEN8, %cl
.set CONSTS_PTR, %ebx // Passed on stack
#endif
// Define aliases for some local variables. V0-V5 are used without
// aliases (for accumulators, data, temporary values, etc). Staying
// within the first 8 vector registers keeps the code 32-bit SSE
// compatible and reduces the size of 64-bit SSE code slightly.
.set BSWAP_MASK, V6
.set BSWAP_MASK_YMM, %ymm6
.set BSWAP_MASK_XMM, %xmm6
.set CONSTS, V7
.set CONSTS_YMM, %ymm7
.set CONSTS_XMM, %xmm7
// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
// functions generated by this macro are called only by static_call.
ANNOTATE_NOENDBR
#ifdef __i386__
push CONSTS_PTR
mov 8(%esp), CONSTS_PTR
#endif
// Create a 128-bit vector that contains the initial CRC in the end
// representing the high-order polynomial coefficients, with the rest zeroed.
// If the CRC is msb-first, also load the byte-reflection table.
.if \n <= 32
_cond_vex movd, CRC, %xmm0
.else
_cond_vex movq, CRC, %xmm0
.endif
.if !LSB_CRC
_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
_vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif
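// For example, with \n == 32: in the lsb-first case the CRC stays in the low
// 4 bytes of %xmm0, which hold the x^96..x^127 coefficients in lsb-first
// order; in the msb-first case it is shifted left by (128-32)/8 = 12 bytes
// into the high 4 bytes, which hold those coefficients in msb-first order.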
// Load the first vector of data and XOR the initial CRC into the
// appropriate end of the first 128-bit lane of data. If LEN < VL, then
// use a short vector and jump ahead to the final reduction. (LEN >= 16
// is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
cmp $VL, LEN
jae .Lat_least_1vec\@
.if VL == 64
cmp $32, LEN32
jb .Lless_than_32bytes\@
_prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
add $32, BUF
jmp .Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
.endif
_prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
add $16, BUF
vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
jmp .Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
_prepare_v0 VL, V0, V1, BSWAP_MASK
// Handle VL <= LEN < 4*VL.
cmp $4*VL-1, LEN
ja .Lat_least_4vecs\@
add $VL, BUF
// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
// If VL==16 then load fold_across_128_bits_consts first, as the final
// reduction depends on it and it won't be loaded anywhere else.
cmp $2*VL-1, LEN32
.if VL == 16
_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
jbe .Lreduce_1vec_to_128bits\@
// Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
// the reduction from 2 vectors.
_load_data VL, (BUF), BSWAP_MASK, V1
add $VL, BUF
jmp .Lreduce_2vecs_to_1\@
.Lat_least_4vecs\@:
// Load 3 more vectors of data.
_load_data VL, 1*VL(BUF), BSWAP_MASK, V1
_load_data VL, 2*VL(BUF), BSWAP_MASK, V2
_load_data VL, 3*VL(BUF), BSWAP_MASK, V3
sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
// 4 vectors of data and write the result back to V0-V3.
cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
jbe .Lreduce_4vecs_to_2\@
_load_vec_folding_consts 2
.Lfold_4vecs_loop\@:
_fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
_fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
_fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
_fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
sub $-4*VL, BUF
add $-4*VL, LEN
cmp $4*VL-1, LEN
ja .Lfold_4vecs_loop\@
// Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
// two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
_load_vec_folding_consts 1
_fold_vec V0, V2, CONSTS, V4
_fold_vec V1, V3, CONSTS, V4
test $2*VL, LEN8
jz .Lreduce_2vecs_to_1\@
_fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
_fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
sub $-2*VL, BUF
// Fold V0 into V1 and write the result back to V0. Then fold one more
// vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
_load_vec_folding_consts 0
_fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
.Lreduce_1vec_to_128bits\@:
.if VL == 64
// Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
// data from BUF, if at least that much remains.
vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
vextracti64x4 $1, %zmm0, %ymm1
_fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
// Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
// data from BUF, if at least that much remains.
vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
vextracti128 $1, %ymm0, %xmm1
_fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
and $15, LEN32
jz .Lreduce_128bits_to_crc\@
// 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
// and B is the polynomial of the remaining LEN data bytes. To reduce
// this to 128 bits without needing fold constants for each possible
// LEN, rearrange this expression into C1*(x^128) + C2, where
// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
// Then fold C1 into C2, which is just another fold across 128 bits.
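// For example, if LEN == 4 then A*(x^32) + B == C1*(x^128) + C2 with
// C1 = floor(A / x^96) and C2 = (A*x^32 mod x^128) + B, B being unaffected by
// the mod since its degree is less than 32.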
.if !LSB_CRC || AVX_LEVEL == 0
// Load the last 16 data bytes. Note that originally LEN was >= 16.
_load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
neg LEN // Needed for indexing shuf_table
.endif
// tmp = A*x^(8*LEN) mod x^128
// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by LEN bytes.
// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
// i.e. left-shift by LEN bytes.
_cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
_cond_vex pshufb, %xmm3, %xmm0, %xmm1
// C1 = floor(A / x^(128 - 8*LEN))
// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
// i.e. left-shift by 16-LEN bytes.
// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by 16-LEN bytes.
_cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
%xmm0, %xmm0, unaligned_mem_tmp=%xmm4
// C2 = tmp + B. This is just a blend of tmp with the last 16 data
// bytes (reflected if msb-first). The blend mask is the shuffle table
// that was used to create tmp: mask bytes with the top bit clear select tmp,
// and mask bytes with the top bit set select the last 16 data bytes.
.if AVX_LEVEL == 0
movdqa %xmm0, %xmm4
movdqa %xmm3, %xmm0
pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
movdqa %xmm4, %xmm0
.elseif LSB_CRC
vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
.endif
// Fold C1 into C2 and store the 128-bit result in %xmm0.
_fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
.Lreduce_128bits_to_crc\@:
// Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
// order according to LSB_CRC), and G is the CRC's generator polynomial.
// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
//
// t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
// x^n * (%xmm0 mod x^64)
//
// Store t0 * x^(64-n) in %xmm0. I.e., actually do:
//
// %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
// x^64 * (%xmm0 mod x^64)
//
// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
// select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
// (considering the extra factor of x that gets implicitly introduced by
// each pclmulqdq when using lsb-first order), is identical to the
// constant that was used earlier for folding the LO64_TERMS across 128
// bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
_cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.else
_cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.endif
_cond_vex pxor, %xmm1, %xmm0, %xmm0
// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
// First step of Barrett reduction: Compute floor(t0 / G). This is the
// polynomial by which G needs to be multiplied to cancel out the x^n
// and higher terms of t0, i.e. to reduce t0 mod G. First do:
//
// t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
//
// Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
// value that makes enough precision be carried through the calculation.
//
// The '* x' makes it so the result is floor(t1 / x^64) rather than
// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
// can be extracted much more easily in the next step. In the lsb-first
// case the '* x' happens implicitly. In the msb-first case it must be
// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
// the multiplication by the x^64 term is handled using a pxor. The
// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
_cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
_pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
_cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
// Second step of Barrett reduction: Cancel out the x^n and higher terms
// of t0 by subtracting the needed multiple of G. This gives the CRC:
//
// crc := t0 - (G * floor(t0 / G))
//
// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
//
// crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
//
// Furthermore, since the resulting CRC is n-bit, if mod x^n is
// explicitly applied to it then the x^n term of G makes no difference
// in the result and can be omitted. This helps keep the constant
// multiplier in 64 bits in most cases. This gives the following:
//
// %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
// crc := (%xmm0 / x^(64-n)) mod x^n
//
// In the lsb-first case, each pclmulqdq implicitly introduces
// an extra factor of x, so in that case the constant that needs to be
// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
// For lsb-first CRCs where n=64, the extra factor of x cannot be as
// easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
// pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
// polynomials have nonzero x^n and x^0 terms.) It works out as: the
// CRC has to be XORed with the physically low qword of %xmm1, representing
// floor(t0 / G). The most efficient way to do that is to move it to
// the physically high qword and use a ternlog to combine the two XORs.
.if LSB_CRC && \n == 64
_cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
.if AVX_LEVEL <= 2
_cond_vex pxor, %xmm2, %xmm0, %xmm0
_cond_vex pxor, %xmm1, %xmm0, %xmm0
.else
vpternlogq $0x96, %xmm2, %xmm1, %xmm0
.endif
_cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
.else
_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
_cond_vex pxor, %xmm1, %xmm0, %xmm0
.if \n == 8
_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
.elseif \n == 16
_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
.elseif \n == 32
_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
.else // \n == 64 && !LSB_CRC
_cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
.endif
.endif
.if VL > 16
vzeroupper // Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
pop CONSTS_PTR
#endif
RET
.endm
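// Instantiate the above template for one CRC variant. This always emits an
// SSE implementation ({prefix}_pclmul_sse), and additionally emits AVX2 and
// AVX512 implementations ({prefix}_vpclmul_avx2 and {prefix}_vpclmul_avx512)
// when the assembler supports VPCLMULQDQ.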
#ifdef CONFIG_AS_VPCLMULQDQ
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse); \
\
SYM_FUNC_START(prefix##_vpclmul_avx2); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
SYM_FUNC_END(prefix##_vpclmul_avx2); \
\
SYM_FUNC_START(prefix##_vpclmul_avx512); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse);
#endif // !CONFIG_AS_VPCLMULQDQ
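// Example usage (illustrative; the actual prefixes and parameters live in the
// per-variant source files that include this template):
//
//	DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
//
// would define crc32_lsb_pclmul_sse() and, when CONFIG_AS_VPCLMULQDQ is set,
// crc32_lsb_vpclmul_avx2() and crc32_lsb_vpclmul_avx512().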