1 files changed, 845 insertions, 0 deletions
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 000000000000..48f97b79f7a9
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ *    - 128-bit vectors (1 AES block per vector)
+ *    - VEX-coded instructions
+ *    - xmm0-xmm15
+ *    - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - VEX-coded instructions
+ *    - ymm0-ymm15
+ *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ *      e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ *    - 256-bit vectors (2 AES blocks per vector)
+ *    - EVEX-coded instructions
+ *    - ymm0-ymm31
+ *    - This is for CPUs that have AVX512 but where using zmm registers causes
+ *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ *      To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ *    - Same as the previous one, but upgrades to 512-bit vectors
+ *      (4 AES blocks per vector) in zmm0-zmm31.
+ *    - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing.  However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
+	// tweak each time a 1 is carried out of the high 64 bits.
+	//
+	// The high 64 bits of this value is just the internal carry bit that
+	// exists when there's a carry out of the low 64 bits of the tweak.
+	.quad	0x87, 1
+
+	// This table contains constants for vpshufb and vpblendvb, used to
+	// handle variable byte shifts and blending during ciphertext stealing
+	// on CPUs that don't support AVX10-style masking.
+.Lcts_permute_table:
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
+				// advanced to point to 7th-from-last round key
+.set	SRC,		%rsi	// Pointer to next source data
+.set	DST,		%rdx	// Pointer to next destination data
+.set	LEN,		%ecx	// Remaining length in bytes
+.set	LEN8,		%cl
+.set	LEN64,		%rcx
+.set	TWEAK,		%r8	// Pointer to next tweak
+
+// %rax holds the AES key length in bytes.
+.set	KEYLEN,		%eax
+.set	KEYLEN64,	%rax
+
+// %r9-r11 are available as temporaries.
+
+.macro	_define_Vi	i
+.if VL == 16
+	.set	V\i,		%xmm\i
+.elseif VL == 32
+	.set	V\i,		%ymm\i
+.elseif VL == 64
+	.set	V\i,		%zmm\i
+.else
+	.error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+	// are available, that map to the xmm, ymm, or zmm registers according
+	// to the selected Vector Length (VL).
+	_define_Vi	0
+	_define_Vi	1
+	_define_Vi	2
+	_define_Vi	3
+	_define_Vi	4
+	_define_Vi	5
+	_define_Vi	6
+	_define_Vi	7
+	_define_Vi	8
+	_define_Vi	9
+	_define_Vi	10
+	_define_Vi	11
+	_define_Vi	12
+	_define_Vi	13
+	_define_Vi	14
+	_define_Vi	15
+.if USE_AVX10
+	_define_Vi	16
+	_define_Vi	17
+	_define_Vi	18
+	_define_Vi	19
+	_define_Vi	20
+	_define_Vi	21
+	_define_Vi	22
+	_define_Vi	23
+	_define_Vi	24
+	_define_Vi	25
+	_define_Vi	26
+	_define_Vi	27
+	_define_Vi	28
+	_define_Vi	29
+	_define_Vi	30
+	_define_Vi	31
+.endif
+
+	// V0-V3 hold the data blocks during the main loop, or temporary values
+	// otherwise.  V4-V5 hold temporary values.
+
+	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
+	.set	TWEAK0_XMM,	%xmm6
+	.set	TWEAK0,		V6
+	.set	TWEAK1_XMM,	%xmm7
+	.set	TWEAK1,		V7
+	.set	TWEAK2,		V8
+	.set	TWEAK3,		V9
+
+	// V10-V13 are used for computing the next values of TWEAK[0-3].
+	.set	NEXT_TWEAK0,	V10
+	.set	NEXT_TWEAK1,	V11
+	.set	NEXT_TWEAK2,	V12
+	.set	NEXT_TWEAK3,	V13
+
+	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+	.set	GF_POLY_XMM,	%xmm14
+	.set	GF_POLY,	V14
+
+	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
+	.set	KEY0_XMM,	%xmm15
+	.set	KEY0,		V15
+
+	// If 32 SIMD registers are available, then V16-V29 hold the remaining
+	// AES round keys, copied to all 128-bit lanes.
+	//
+	// AES-128, AES-192, and AES-256 use different numbers of round keys.
+	// To allow handling all three variants efficiently, we align the round
+	// keys to the *end* of this register range.  I.e., AES-128 uses
+	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+	// (All also use KEY0 for the XOR-only "round" at the beginning.)
+.if USE_AVX10
+	.set	KEY1_XMM,	%xmm16
+	.set	KEY1,		V16
+	.set	KEY2_XMM,	%xmm17
+	.set	KEY2,		V17
+	.set	KEY3_XMM,	%xmm18
+	.set	KEY3,		V18
+	.set	KEY4_XMM,	%xmm19
+	.set	KEY4,		V19
+	.set	KEY5_XMM,	%xmm20
+	.set	KEY5,		V20
+	.set	KEY6_XMM,	%xmm21
+	.set	KEY6,		V21
+	.set	KEY7_XMM,	%xmm22
+	.set	KEY7,		V22
+	.set	KEY8_XMM,	%xmm23
+	.set	KEY8,		V23
+	.set	KEY9_XMM,	%xmm24
+	.set	KEY9,		V24
+	.set	KEY10_XMM,	%xmm25
+	.set	KEY10,		V25
+	.set	KEY11_XMM,	%xmm26
+	.set	KEY11,		V26
+	.set	KEY12_XMM,	%xmm27
+	.set	KEY12,		V27
+	.set	KEY13_XMM,	%xmm28
+	.set	KEY13,		V28
+	.set	KEY14_XMM,	%xmm29
+	.set	KEY14,		V29
+.endif
+	// V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro	_vmovdqu	src, dst
+.if VL < 64
+	vmovdqu		\src, \dst
+.else
+	vmovdqu8	\src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro	_vbroadcast128	src, dst
+.if VL == 16 && !USE_AVX10
+	vmovdqu		\src, \dst
+.elseif VL == 32 && !USE_AVX10
+	vbroadcasti128	\src, \dst
+.else
+	vbroadcasti32x4	\src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro	_vpxor	src1, src2, dst
+.if USE_AVX10
+	vpxord		\src1, \src2, \dst
+.else
+	vpxor		\src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro	_xor3	src1, src2, src3_and_dst
+.if USE_AVX10
+	// vpternlogd with immediate 0x96 is a three-argument XOR.
+	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
+.else
+	vpxor		\src1, \src3_and_dst, \src3_and_dst
+	vpxor		\src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro	_next_tweak	src, tmp, dst
+	vpshufd		$0x13, \src, \tmp
+	vpaddq		\src, \src, \dst
+	vpsrad		$31, \tmp, \tmp
+	vpand		GF_POLY_XMM, \tmp, \tmp
+	vpxor		\tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel.  If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro	_next_tweakvec	src, tmp1, tmp2, dst
+.if VL == 16
+	_next_tweak	\src, \tmp1, \dst
+.else
+	vpsrlq		$64 - VL/16, \src, \tmp1
+	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
+	vpslldq		$8, \tmp1, \tmp1
+	vpsllq		$VL/16, \src, \dst
+	_xor3		\tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
+.macro	_compute_first_set_of_tweaks
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
+.if VL == 16
+	// With VL=16, multiplying by x serially is fastest.
+	_next_tweak	TWEAK0, %xmm0, TWEAK1
+	_next_tweak	TWEAK1, %xmm0, TWEAK2
+	_next_tweak	TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+	// Compute the second block of TWEAK0.
+	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
+	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+	// Compute the remaining blocks of TWEAK0.
+	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
+	_next_tweak	%xmm1, %xmm0, %xmm2
+	_next_tweak	%xmm2, %xmm0, %xmm3
+	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
+	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
+	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
+.endif
+	// Compute TWEAK[1-3] from TWEAK0.
+	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
+	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
+	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpclmulqdq	$0x01, GF_POLY, V2, V3
+	vpclmulqdq	$0x01, GF_POLY, V4, V5
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V2, V2
+	vpslldq		$8, V4, V4
+	vpsllq		$1*VL/16, TWEAK0, TWEAK1
+	vpsllq		$2*VL/16, TWEAK0, TWEAK2
+	vpsllq		$3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+	vpternlogd	$0x96, V0, V1, TWEAK1
+	vpternlogd	$0x96, V2, V3, TWEAK2
+	vpternlogd	$0x96, V4, V5, TWEAK3
+.else
+	vpxor		V0, TWEAK1, TWEAK1
+	vpxor		V2, TWEAK2, TWEAK2
+	vpxor		V4, TWEAK3, TWEAK3
+	vpxor		V1, TWEAK1, TWEAK1
+	vpxor		V3, TWEAK2, TWEAK2
+	vpxor		V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro	_tweak_step_mulx	i
+.if \i == 0
+	.set PREV_TWEAK, TWEAK3
+	.set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+	.set PREV_TWEAK, NEXT_TWEAK0
+	.set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+	.set PREV_TWEAK, NEXT_TWEAK1
+	.set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+	.set PREV_TWEAK, NEXT_TWEAK2
+	.set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i >= 0 && \i < 20 && \i % 5 == 0
+	vpshufd		$0x13, PREV_TWEAK, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
+	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
+	vpsrad		$31, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
+	vpand		GF_POLY, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
+	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+	vmovdqa		NEXT_TWEAK0, TWEAK0
+	vmovdqa		NEXT_TWEAK1, TWEAK1
+	vmovdqa		NEXT_TWEAK2, TWEAK2
+	vmovdqa		NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
+// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+.macro	_tweak_step_pclmul	i
+.if \i == 0
+	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 2
+	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 4
+	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 6
+	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 8
+	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 10
+	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 12
+	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 14
+	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
+	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
+	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
+	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
+	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
+	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
+	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
+	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases.
+.macro	_tweak_step	i
+.if VL == 16
+	_tweak_step_mulx	\i
+.else
+	_tweak_step_pclmul	\i
+.endif
+.endm
+
+.macro	_setup_round_keys	enc
+
+	// Select either the encryption round keys or the decryption round keys.
+.if \enc
+	.set	OFFS, 0
+.else
+	.set	OFFS, 240
+.endif
+
+	// Load the round key for "round 0".
+	_vbroadcast128	OFFS(KEY), KEY0
+
+	// Increment KEY to make it so that 7*16(KEY) is the last round key.
+	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
+	// counting the zero-th round key which was just loaded into KEY0) being
+	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
+	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
+	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+	//
+	// This rebasing provides two benefits.  First, it makes the offset to
+	// any round key be in the range [-96, 112], fitting in a signed byte.
+	// This shortens VEX-encoded instructions that access the later round
+	// keys which otherwise would need 4-byte offsets.  Second, it makes it
+	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+	// beginning.  Skipping rounds at the end doesn't work as well because
+	// the last round needs different instructions.
+	//
+	// An alternative approach would be to roll up all the round loops.  We
+	// don't do that because it isn't compatible with caching the round keys
+	// in registers which we do when possible (see below), and also because
+	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
+	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
+
+	// If all 32 SIMD registers are available, cache all the round keys.
+.if USE_AVX10
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
+	_vbroadcast128	-6*16(KEY), KEY1
+	_vbroadcast128	-5*16(KEY), KEY2
+.Laes192\@:
+	_vbroadcast128	-4*16(KEY), KEY3
+	_vbroadcast128	-3*16(KEY), KEY4
+.Laes128\@:
+	_vbroadcast128	-2*16(KEY), KEY5
+	_vbroadcast128	-1*16(KEY), KEY6
+	_vbroadcast128	0*16(KEY), KEY7
+	_vbroadcast128	1*16(KEY), KEY8
+	_vbroadcast128	2*16(KEY), KEY9
+	_vbroadcast128	3*16(KEY), KEY10
+	_vbroadcast128	4*16(KEY), KEY11
+	_vbroadcast128	5*16(KEY), KEY12
+	_vbroadcast128	6*16(KEY), KEY13
+	_vbroadcast128	7*16(KEY), KEY14
+.endif
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key.  The register length
+// determines the number of AES blocks en/decrypted.
+.macro	_vaes	enc, last, key, data
+.if \enc
+.if \last
+	vaesenclast	\key, \data, \data
+.else
+	vaesenc		\key, \data, \data
+.endif
+.else
+.if \last
+	vaesdeclast	\key, \data, \data
+.else
+	vaesdec		\key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s).  The round key is loaded from the appropriate
+// register or memory location for round \i.  May clobber V4.
+.macro _vaes_1x		enc, last, i, xmm_suffix, data
+.if USE_AVX10
+	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+	_vaes		\enc, \last, (\i-7)*16(KEY), \data
+.else
+	_vbroadcast128	(\i-7)*16(KEY), V4
+	_vaes		\enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks.  The round key is loaded from the
+// appropriate register or memory location for round \i.  In addition, does two
+// steps of the computation of the next set of tweaks.  May clobber V4.
+.macro	_vaes_4x	enc, last, i
+.if USE_AVX10
+	_tweak_step	(2*(\i-5))
+	_vaes		\enc, \last, KEY\i, V0
+	_vaes		\enc, \last, KEY\i, V1
+	_tweak_step	(2*(\i-5) + 1)
+	_vaes		\enc, \last, KEY\i, V2
+	_vaes		\enc, \last, KEY\i, V3
+.else
+	_vbroadcast128	(\i-7)*16(KEY), V4
+	_tweak_step	(2*(\i-5))
+	_vaes		\enc, \last, V4, V0
+	_vaes		\enc, \last, V4, V1
+	_tweak_step	(2*(\i-5) + 1)
+	_vaes		\enc, \last, V4, V2
+	_vaes		\enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data.  To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
+.macro	_aes_crypt	enc, xmm_suffix, tweak, data
+	_xor3		KEY0\xmm_suffix, \tweak, \data
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
+	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
+	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
+	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
+	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
+	_vpxor		\tweak, \data, \data
+.endm
+
+.macro	_aes_xts_crypt	enc
+	_define_aliases
+
+.if !\enc
+	// When decrypting a message whose length isn't a multiple of the AES
+	// block length, exclude the last full block from the main loop by
+	// subtracting 16 from LEN.  This is needed because ciphertext stealing
+	// decryption uses the last two tweaks in reverse order.  We'll handle
+	// the last full block and the partial block specially at the end.
+	lea		-16(LEN), %eax
+	test		$15, LEN8
+	cmovnz		%eax, LEN
+.endif
+
+	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+	movl		480(KEY), KEYLEN
+
+	// Setup the pointer to the round keys and cache as many as possible.
+	_setup_round_keys	\enc
+
+	// Compute the first set of tweaks TWEAK[0-3].
+	_compute_first_set_of_tweaks
+
+	sub		$4*VL, LEN
+	jl		.Lhandle_remainder\@
+
+.Lmain_loop\@:
+	// This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+	// XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX10
+	vmovdqu8	0*VL(SRC), V0
+	vmovdqu8	1*VL(SRC), V1
+	vmovdqu8	2*VL(SRC), V2
+	vmovdqu8	3*VL(SRC), V3
+	vpternlogd	$0x96, TWEAK0, KEY0, V0
+	vpternlogd	$0x96, TWEAK1, KEY0, V1
+	vpternlogd	$0x96, TWEAK2, KEY0, V2
+	vpternlogd	$0x96, TWEAK3, KEY0, V3
+.else
+	vpxor		0*VL(SRC), KEY0, V0
+	vpxor		1*VL(SRC), KEY0, V1
+	vpxor		2*VL(SRC), KEY0, V2
+	vpxor		3*VL(SRC), KEY0, V3
+	vpxor		TWEAK0, V0, V0
+	vpxor		TWEAK1, V1, V1
+	vpxor		TWEAK2, V2, V2
+	vpxor		TWEAK3, V3, V3
+.endif
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
+	// Do all the AES rounds on the data blocks, interleaved with
+	// the computation of the next set of tweaks.
+	_vaes_4x	\enc, 0, 1
+	_vaes_4x	\enc, 0, 2
+.Laes192\@:
+	_vaes_4x	\enc, 0, 3
+	_vaes_4x	\enc, 0, 4
+.Laes128\@:
+	_vaes_4x	\enc, 0, 5
+	_vaes_4x	\enc, 0, 6
+	_vaes_4x	\enc, 0, 7
+	_vaes_4x	\enc, 0, 8
+	_vaes_4x	\enc, 0, 9
+	_vaes_4x	\enc, 0, 10
+	_vaes_4x	\enc, 0, 11
+	_vaes_4x	\enc, 0, 12
+	_vaes_4x	\enc, 0, 13
+	_vaes_4x	\enc, 1, 14
+
+	// XOR in the tweaks again.
+	_vpxor		TWEAK0, V0, V0
+	_vpxor		TWEAK1, V1, V1
+	_vpxor		TWEAK2, V2, V2
+	_vpxor		TWEAK3, V3, V3
+
+	// Store the destination blocks.
+	_vmovdqu	V0, 0*VL(DST)
+	_vmovdqu	V1, 1*VL(DST)
+	_vmovdqu	V2, 2*VL(DST)
+	_vmovdqu	V3, 3*VL(DST)
+
+	// Finish computing the next set of tweaks.
+	_tweak_step	1000
+
+	add		$4*VL, SRC
+	add		$4*VL, DST
+	sub		$4*VL, LEN
+	jge		.Lmain_loop\@
+
+	// Check for the uncommon case where the data length isn't a multiple of
+	// 4*VL.  Handle it out-of-line in order to optimize for the common
+	// case.  In the common case, just fall through to the ret.
+	test		$4*VL-1, LEN8
+	jnz		.Lhandle_remainder\@
+.Ldone\@:
+	// Store the next tweak back to *TWEAK to support continuation calls.
+	vmovdqu		TWEAK0_XMM, (TWEAK)
+.if VL > 16
+	vzeroupper
+.endif
+	RET
+
+.Lhandle_remainder\@:
+
+	// En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
+	jl		.Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+	_vmovdqu	(SRC), V0
+	_aes_crypt	\enc, , TWEAK0, V0
+	_vmovdqu	V0, (DST)
+	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
+	add		$VL, SRC
+	add		$VL, DST
+	sub		$VL, LEN
+	jge		.Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
+.else
+	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
+.endif
+
+	// En/decrypt any remaining full blocks, one at a time.
+	jl		.Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+	vmovdqu		(SRC), %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	vmovdqu		%xmm0, (DST)
+	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
+	add		$16, SRC
+	add		$16, DST
+	sub		$16, LEN
+	jge		.Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+	add		$16, LEN	// Undo the extra sub of 16.
+	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
+	jz		.Ldone\@
+
+	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+	// Do ciphertext stealing to process the last 16 + LEN bytes.
+
+.if \enc
+	// If encrypting, the main loop already encrypted the last full block to
+	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
+	// by rewinding the pointers and loading the intermediate ciphertext.
+	sub		$16, SRC
+	sub		$16, DST
+	vmovdqu		(DST), %xmm0
+.else
+	// If decrypting, the main loop didn't decrypt the last full block
+	// because CTS decryption uses the last two tweaks in reverse order.
+	// Do it now by advancing the tweak and decrypting the last full block.
+	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
+	vmovdqu		(SRC), %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
+.endif
+
+.if USE_AVX10
+	// Create a mask that has the first LEN bits set.
+	mov		$-1, %r9d
+	bzhi		LEN, %r9d, %r9d
+	kmovd		%r9d, %k1
+
+	// Swap the first LEN bytes of the en/decryption of the last full block
+	// with the partial block.  Note that to support in-place en/decryption,
+	// the load from the src partial block must happen before the store to
+	// the dst partial block.
+	vmovdqa		%xmm0, %xmm1
+	vmovdqu8	16(SRC), %xmm0{%k1}
+	vmovdqu8	%xmm1, 16(DST){%k1}
+.else
+	lea		.Lcts_permute_table(%rip), %r9
+
+	// Load the src partial block, left-aligned.  Note that to support
+	// in-place en/decryption, this must happen before the store to the dst
+	// partial block.
+	vmovdqu		(SRC, LEN64, 1), %xmm1
+
+	// Shift the first LEN bytes of the en/decryption of the last full block
+	// to the end of a register, then store it to DST+LEN.  This stores the
+	// dst partial block.  It also writes to the second part of the dst last
+	// full block, but that part is overwritten later.
+	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
+	vmovdqu		%xmm2, (DST, LEN64, 1)
+
+	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+	sub		LEN64, %r9
+	vmovdqu		32(%r9), %xmm3
+
+	// Shift the src partial block to the beginning of its register.
+	vpshufb		%xmm3, %xmm1, %xmm1
+
+	// Do a blend to generate the src partial block followed by the second
+	// part of the en/decryption of the last full block.
+	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
+.endif
+	// En/decrypt again and store the last full block.
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	vmovdqu		%xmm0, (DST)
+	jmp		.Ldone\@
+.endm
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+//			   u8 iv[AES_BLOCK_SIZE]);
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
+	vmovdqu		(%rsi), %xmm0
+	vpxor		(%rdi), %xmm0, %xmm0
+	movl		480(%rdi), %eax		// AES key length
+	lea		-16(%rdi, %rax, 4), %rdi
+	cmp		$24, %eax
+	jl		.Lencrypt_iv_aes128
+	je		.Lencrypt_iv_aes192
+	vaesenc		-6*16(%rdi), %xmm0, %xmm0
+	vaesenc		-5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
+	vaesenc		-4*16(%rdi), %xmm0, %xmm0
+	vaesenc		-3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
+	vaesenc		-2*16(%rdi), %xmm0, %xmm0
+	vaesenc		-1*16(%rdi), %xmm0, %xmm0
+	vaesenc		0*16(%rdi), %xmm0, %xmm0
+	vaesenc		1*16(%rdi), %xmm0, %xmm0
+	vaesenc		2*16(%rdi), %xmm0, %xmm0
+	vaesenc		3*16(%rdi), %xmm0, %xmm0
+	vaesenc		4*16(%rdi), %xmm0, %xmm0
+	vaesenc		5*16(%rdi), %xmm0, %xmm0
+	vaesenc		6*16(%rdi), %xmm0, %xmm0
+	vaesenclast	7*16(%rdi), %xmm0, %xmm0
+	vmovdqu		%xmm0, (%rsi)
+	RET
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro.  They all have the following prototype:
+//
+// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
+//			const u8 *src, u8 *dst, unsigned int len,
+//			u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key.  |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done.  This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call.  If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+
+.set	VL, 16
+.set	USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
+	_aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
+	_aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+.set	VL, 32
+.set	USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
+	_aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
+	_aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
+
+.set	VL, 32
+.set	USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
+	_aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
+	_aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
+
+.set	VL, 64
+.set	USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+	_aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+	_aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */