17 files changed, 927 insertions, 893 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 4757bf922075..3d948f10c94c 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -3,10 +3,12 @@
 menu "Accelerated Cryptographic Algorithms for CPU (x86)"
 
 config CRYPTO_CURVE25519_X86
-	tristate "Public key crypto: Curve25519 (ADX)"
+	tristate
 	depends on X86 && 64BIT
+	select CRYPTO_KPP
 	select CRYPTO_LIB_CURVE25519_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+	default CRYPTO_LIB_CURVE25519_INTERNAL
 	help
 	  Curve25519 algorithm
 
@@ -348,11 +350,12 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64
 	  Processes 64 blocks in parallel.
 
 config CRYPTO_CHACHA20_X86_64
-	tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
+	tristate
 	depends on X86 && 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_CHACHA_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+	default CRYPTO_LIB_CHACHA_INTERNAL
 	help
 	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
 	  stream cipher algorithms
@@ -417,10 +420,12 @@ config CRYPTO_POLYVAL_CLMUL_NI
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
 config CRYPTO_POLY1305_X86_64
-	tristate "Hash functions: Poly1305 (SSE2/AVX2)"
+	tristate
 	depends on X86 && 64BIT
+	select CRYPTO_HASH
 	select CRYPTO_LIB_POLY1305_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+	default CRYPTO_LIB_POLY1305_INTERNAL
 	help
 	  Poly1305 authenticator algorithm (RFC7539)
 
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 07b00bfca64b..5d19f41bde58 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,7 +48,7 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
 
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \
+aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
 			       aes-gcm-aesni-x86_64.o \
 			       aes-xts-avx-x86_64.o
 ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 01fa568dc5fc..26786e15abac 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -71,10 +71,9 @@ static void crypto_aegis128_aesni_process_ad(
 
 	scatterwalk_start(&walk, sg_src);
 	while (assoclen != 0) {
-		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int size = scatterwalk_next(&walk, assoclen);
+		const u8 *src = walk.addr;
 		unsigned int left = size;
-		void *mapped = scatterwalk_map(&walk);
-		const u8 *src = (const u8 *)mapped;
 
 		if (pos + size >= AEGIS128_BLOCK_SIZE) {
 			if (pos > 0) {
@@ -97,9 +96,7 @@ static void crypto_aegis128_aesni_process_ad(
 		pos += left;
 		assoclen -= size;
 
-		scatterwalk_unmap(mapped);
-		scatterwalk_advance(&walk, size);
-		scatterwalk_done(&walk, 0, assoclen);
+		scatterwalk_done_src(&walk, size);
 	}
 
 	if (pos > 0) {
diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S
new file mode 100644
index 000000000000..1685d8b24b2c
--- /dev/null
+++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S
@@ -0,0 +1,592 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
+// of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
+// using the following sets of CPU features:
+//	- AES-NI && AVX
+//	- VAES && AVX2
+//	- VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
+//	- VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
+//
+// See the function definitions at the bottom of the file for more information.
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+
+.Lbswap_mask:
+	.octa	0x000102030405060708090a0b0c0d0e0f
+
+.Lctr_pattern:
+	.quad	0, 0
+.Lone:
+	.quad	1, 0
+.Ltwo:
+	.quad	2, 0
+	.quad	3, 0
+
+.Lfour:
+	.quad	4, 0
+
+.text
+
+// Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
+.macro	_vmovdqu	src, dst
+.if VL < 64
+	vmovdqu		\src, \dst
+.else
+	vmovdqu8	\src, \dst
+.endif
+.endm
+
+// Move a vector between registers.
+// The registers must be in the first 16 vector registers.
+.macro	_vmovdqa	src, dst
+.if VL < 64
+	vmovdqa		\src, \dst
+.else
+	vmovdqa64	\src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
+// register.  The register operand must be in the first 16 vector registers.
+.macro	_vbroadcast128	src, dst
+.if VL == 16
+	vmovdqu		\src, \dst
+.elseif VL == 32
+	vbroadcasti128	\src, \dst
+.else
+	vbroadcasti32x4	\src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
+.macro	_vpxor	src1, src2, dst
+.if VL < 64
+	vpxor		\src1, \src2, \dst
+.else
+	vpxord		\src1, \src2, \dst
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro	_load_partial_block	src, dst, tmp64, tmp32
+	sub		$8, %ecx		// LEN - 8
+	jle		.Lle8\@
+
+	// Load 9 <= LEN <= 15 bytes.
+	vmovq		(\src), \dst		// Load first 8 bytes
+	mov		(\src, %rcx), %rax	// Load last 8 bytes
+	neg		%ecx
+	shl		$3, %ecx
+	shr		%cl, %rax		// Discard overlapping bytes
+	vpinsrq		$1, %rax, \dst, \dst
+	jmp		.Ldone\@
+
+.Lle8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Load 4 <= LEN <= 8 bytes.
+	mov		(\src), %eax		// Load first 4 bytes
+	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
+	jmp		.Lcombine\@
+
+.Llt4\@:
+	// Load 1 <= LEN <= 3 bytes.
+	add		$2, %ecx		// LEN - 2
+	movzbl		(\src), %eax		// Load first byte
+	jl		.Lmovq\@
+	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
+.Lcombine\@:
+	shl		$3, %ecx
+	shl		%cl, \tmp64
+	or		\tmp64, %rax		// Combine the two parts
+.Lmovq\@:
+	vmovq		%rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro	_store_partial_block	src, dst, tmp64, tmp32
+	sub		$8, %ecx		// LEN - 8
+	jl		.Llt8\@
+
+	// Store 8 <= LEN <= 15 bytes.
+	vpextrq		$1, \src, %rax
+	mov		%ecx, \tmp32
+	shl		$3, %ecx
+	ror		%cl, %rax
+	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
+	vmovq		\src, (\dst)		// Store first 8 bytes
+	jmp		.Ldone\@
+
+.Llt8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Store 4 <= LEN <= 7 bytes.
+	vpextrd		$1, \src, %eax
+	mov		%ecx, \tmp32
+	shl		$3, %ecx
+	ror		%cl, %eax
+	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
+	vmovd		\src, (\dst)		// Store first 4 bytes
+	jmp		.Ldone\@
+
+.Llt4\@:
+	// Store 1 <= LEN <= 3 bytes.
+	vpextrb		$0, \src, 0(\dst)
+	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
+	jl		.Ldone\@
+	vpextrb		$1, \src, 1(\dst)
+	je		.Ldone\@
+	vpextrb		$2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
+// XOR each with the zero-th round key.  Also update LE_CTR if !\final.
+.macro	_prepare_2_ctr_vecs	is_xctr, i0, i1, final=0
+.if \is_xctr
+  .if USE_AVX10
+	_vmovdqa	LE_CTR, AESDATA\i0
+	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
+  .else
+	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
+	vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
+  .endif
+	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
+
+  .if USE_AVX10
+	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
+  .else
+	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
+	vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
+  .endif
+.else
+	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i0
+	_vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
+	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
+	vpshufb		BSWAP_MASK, AESDATA\i1, AESDATA\i1
+	_vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
+.endif
+.if !\final
+	vpaddq		LE_CTR_INC2, LE_CTR, LE_CTR
+.endif
+.endm
+
+// Do all AES rounds on the data in the given AESDATA vectors, excluding the
+// zero-th and last rounds.
+.macro	_aesenc_loop	vecs:vararg
+	mov		KEY, %rax
+1:
+	_vbroadcast128	(%rax), RNDKEY
+.irp i, \vecs
+	vaesenc		RNDKEY, AESDATA\i, AESDATA\i
+.endr
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		1b
+.endm
+
+// Finalize the keystream blocks in the given AESDATA vectors by doing the last
+// AES round, then XOR those keystream blocks with the corresponding data.
+// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+.macro	_aesenclast_and_xor	vecs:vararg
+.irp i, \vecs
+	_vpxor		\i*VL(SRC), RNDKEYLAST, RNDKEY
+	vaesenclast	RNDKEY, AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+	_vmovdqu	AESDATA\i, \i*VL(DST)
+.endr
+.endm
+
+// XOR the keystream blocks in the specified AESDATA vectors with the
+// corresponding data.
+.macro	_xor_data	vecs:vararg
+.irp i, \vecs
+	_vpxor		\i*VL(SRC), AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+	_vmovdqu	AESDATA\i, \i*VL(DST)
+.endr
+.endm
+
+.macro	_aes_ctr_crypt		is_xctr
+
+	// Define register aliases V0-V15 that map to the xmm, ymm, or zmm
+	// registers according to the selected Vector Length (VL).
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+  .if VL == 16
+	.set	V\i,		%xmm\i
+  .elseif VL == 32
+	.set	V\i,		%ymm\i
+  .elseif VL == 64
+	.set	V\i,		%zmm\i
+  .else
+	.error "Unsupported Vector Length (VL)"
+  .endif
+.endr
+
+	// Function arguments
+	.set	KEY,		%rdi	// Initially points to the start of the
+					// crypto_aes_ctx, then is advanced to
+					// point to the index 1 round key
+	.set	KEY32,		%edi	// Available as temp register after all
+					// keystream blocks have been generated
+	.set	SRC,		%rsi	// Pointer to next source data
+	.set	DST,		%rdx	// Pointer to next destination data
+	.set	LEN,		%ecx	// Remaining length in bytes.
+					// Note: _load_partial_block relies on
+					// this being in %ecx.
+	.set	LEN64,		%rcx	// Zero-extend LEN before using!
+	.set	LEN8,		%cl
+.if \is_xctr
+	.set	XCTR_IV_PTR,	%r8	// const u8 iv[AES_BLOCK_SIZE];
+	.set	XCTR_CTR,	%r9	// u64 ctr;
+.else
+	.set	LE_CTR_PTR,	%r8	// const u64 le_ctr[2];
+.endif
+
+	// Additional local variables
+	.set	RNDKEYLAST_PTR,	%r10
+	.set	AESDATA0,	V0
+	.set	AESDATA0_XMM,	%xmm0
+	.set	AESDATA1,	V1
+	.set	AESDATA1_XMM,	%xmm1
+	.set	AESDATA2,	V2
+	.set	AESDATA3,	V3
+	.set	AESDATA4,	V4
+	.set	AESDATA5,	V5
+	.set	AESDATA6,	V6
+	.set	AESDATA7,	V7
+.if \is_xctr
+	.set	XCTR_IV,	V8
+.else
+	.set	BSWAP_MASK,	V8
+.endif
+	.set	LE_CTR,		V9
+	.set	LE_CTR_XMM,	%xmm9
+	.set	LE_CTR_INC1,	V10
+	.set	LE_CTR_INC2,	V11
+	.set	RNDKEY0,	V12
+	.set	RNDKEYLAST,	V13
+	.set	RNDKEY,		V14
+
+	// Create the first vector of counters.
+.if \is_xctr
+  .if VL == 16
+	vmovq		XCTR_CTR, LE_CTR
+  .elseif VL == 32
+	vmovq		XCTR_CTR, LE_CTR_XMM
+	inc		XCTR_CTR
+	vmovq		XCTR_CTR, AESDATA0_XMM
+	vinserti128	$1, AESDATA0_XMM, LE_CTR, LE_CTR
+  .else
+	vpbroadcastq	XCTR_CTR, LE_CTR
+	vpsrldq		$8, LE_CTR, LE_CTR
+	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
+  .endif
+	_vbroadcast128	(XCTR_IV_PTR), XCTR_IV
+.else
+	_vbroadcast128	(LE_CTR_PTR), LE_CTR
+  .if VL > 16
+	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
+  .endif
+	_vbroadcast128	.Lbswap_mask(%rip), BSWAP_MASK
+.endif
+
+.if VL == 16
+	_vbroadcast128	.Lone(%rip), LE_CTR_INC1
+.elseif VL == 32
+	_vbroadcast128	.Ltwo(%rip), LE_CTR_INC1
+.else
+	_vbroadcast128	.Lfour(%rip), LE_CTR_INC1
+.endif
+	vpsllq		$1, LE_CTR_INC1, LE_CTR_INC2
+
+	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+	movl		480(KEY), %eax
+
+	// Compute the pointer to the last round key.
+	lea		6*16(KEY, %rax, 4), RNDKEYLAST_PTR
+
+	// Load the zero-th and last round keys.
+	_vbroadcast128	(KEY), RNDKEY0
+	_vbroadcast128	(RNDKEYLAST_PTR), RNDKEYLAST
+
+	// Make KEY point to the first round key.
+	add		$16, KEY
+
+	// This is the main loop, which encrypts 8 vectors of data at a time.
+	add		$-8*VL, LEN
+	jl		.Lloop_8x_done\@
+.Lloop_8x\@:
+	_prepare_2_ctr_vecs	\is_xctr, 0, 1
+	_prepare_2_ctr_vecs	\is_xctr, 2, 3
+	_prepare_2_ctr_vecs	\is_xctr, 4, 5
+	_prepare_2_ctr_vecs	\is_xctr, 6, 7
+	_aesenc_loop	0,1,2,3,4,5,6,7
+	_aesenclast_and_xor 0,1,2,3,4,5,6,7
+	sub		$-8*VL, SRC
+	sub		$-8*VL, DST
+	add		$-8*VL, LEN
+	jge		.Lloop_8x\@
+.Lloop_8x_done\@:
+	sub		$-8*VL, LEN
+	jz		.Ldone\@
+
+	// 1 <= LEN < 8*VL.  Generate 2, 4, or 8 more vectors of keystream
+	// blocks, depending on the remaining LEN.
+
+	_prepare_2_ctr_vecs	\is_xctr, 0, 1
+	_prepare_2_ctr_vecs	\is_xctr, 2, 3
+	cmp		$4*VL, LEN
+	jle		.Lenc_tail_atmost4vecs\@
+
+	// 4*VL < LEN < 8*VL.  Generate 8 vectors of keystream blocks.  Use the
+	// first 4 to XOR 4 full vectors of data.  Then XOR the remaining data.
+	_prepare_2_ctr_vecs	\is_xctr, 4, 5
+	_prepare_2_ctr_vecs	\is_xctr, 6, 7, final=1
+	_aesenc_loop	0,1,2,3,4,5,6,7
+	_aesenclast_and_xor 0,1,2,3
+	vaesenclast	RNDKEYLAST, AESDATA4, AESDATA0
+	vaesenclast	RNDKEYLAST, AESDATA5, AESDATA1
+	vaesenclast	RNDKEYLAST, AESDATA6, AESDATA2
+	vaesenclast	RNDKEYLAST, AESDATA7, AESDATA3
+	sub		$-4*VL, SRC
+	sub		$-4*VL, DST
+	add		$-4*VL, LEN
+	cmp		$1*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_0\@
+	_xor_data	0
+	cmp		$2*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_1\@
+	_xor_data	1
+	cmp		$3*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_2\@
+	_xor_data	2
+	cmp		$4*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_3\@
+	_xor_data	3
+	jmp		.Ldone\@
+
+.Lenc_tail_atmost4vecs\@:
+	cmp		$2*VL, LEN
+	jle		.Lenc_tail_atmost2vecs\@
+
+	// 2*VL < LEN <= 4*VL.  Generate 4 vectors of keystream blocks.  Use the
+	// first 2 to XOR 2 full vectors of data.  Then XOR the remaining data.
+	_aesenc_loop	0,1,2,3
+	_aesenclast_and_xor 0,1
+	vaesenclast	RNDKEYLAST, AESDATA2, AESDATA0
+	vaesenclast	RNDKEYLAST, AESDATA3, AESDATA1
+	sub		$-2*VL, SRC
+	sub		$-2*VL, DST
+	add		$-2*VL, LEN
+	jmp		.Lxor_tail_upto2vecs\@
+
+.Lenc_tail_atmost2vecs\@:
+	// 1 <= LEN <= 2*VL.  Generate 2 vectors of keystream blocks.  Then XOR
+	// the remaining data.
+	_aesenc_loop	0,1
+	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
+	vaesenclast	RNDKEYLAST, AESDATA1, AESDATA1
+
+.Lxor_tail_upto2vecs\@:
+	cmp		$1*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_0\@
+	_xor_data	0
+	cmp		$2*VL-1, LEN
+	jle		.Lxor_tail_partial_vec_1\@
+	_xor_data	1
+	jmp		.Ldone\@
+
+.Lxor_tail_partial_vec_1\@:
+	add		$-1*VL, LEN
+	jz		.Ldone\@
+	sub		$-1*VL, SRC
+	sub		$-1*VL, DST
+	_vmovdqa	AESDATA1, AESDATA0
+	jmp		.Lxor_tail_partial_vec_0\@
+
+.Lxor_tail_partial_vec_2\@:
+	add		$-2*VL, LEN
+	jz		.Ldone\@
+	sub		$-2*VL, SRC
+	sub		$-2*VL, DST
+	_vmovdqa	AESDATA2, AESDATA0
+	jmp		.Lxor_tail_partial_vec_0\@
+
+.Lxor_tail_partial_vec_3\@:
+	add		$-3*VL, LEN
+	jz		.Ldone\@
+	sub		$-3*VL, SRC
+	sub		$-3*VL, DST
+	_vmovdqa	AESDATA3, AESDATA0
+
+.Lxor_tail_partial_vec_0\@:
+	// XOR the remaining 1 <= LEN < VL bytes.  It's easy if masked
+	// loads/stores are available; otherwise it's a bit harder...
+.if USE_AVX10
+  .if VL <= 32
+	mov		$-1, %eax
+	bzhi		LEN, %eax, %eax
+	kmovd		%eax, %k1
+  .else
+	mov		$-1, %rax
+	bzhi		LEN64, %rax, %rax
+	kmovq		%rax, %k1
+  .endif
+	vmovdqu8	(SRC), AESDATA1{%k1}{z}
+	_vpxor		AESDATA1, AESDATA0, AESDATA0
+	vmovdqu8	AESDATA0, (DST){%k1}
+.else
+  .if VL == 32
+	cmp		$16, LEN
+	jl		1f
+	vpxor		(SRC), AESDATA0_XMM, AESDATA1_XMM
+	vmovdqu		AESDATA1_XMM, (DST)
+	add		$16, SRC
+	add		$16, DST
+	sub		$16, LEN
+	jz		.Ldone\@
+	vextracti128	$1, AESDATA0, AESDATA0_XMM
+1:
+  .endif
+	mov		LEN, %r10d
+	_load_partial_block	SRC, AESDATA1_XMM, KEY, KEY32
+	vpxor		AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
+	mov		%r10d, %ecx
+	_store_partial_block	AESDATA0_XMM, DST, KEY, KEY32
+.endif
+
+.Ldone\@:
+.if VL > 16
+	vzeroupper
+.endif
+	RET
+.endm
+
+// Below are the definitions of the functions generated by the above macro.
+// They have the following prototypes:
+//
+//
+// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
+//				 const u8 *src, u8 *dst, int len,
+//				 const u64 le_ctr[2]);
+//
+// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
+//				const u8 *src, u8 *dst, int len,
+//				const u8 iv[AES_BLOCK_SIZE], u64 ctr);
+//
+// Both functions generate |len| bytes of keystream, XOR it with the data from
+// |src|, and write the result to |dst|.  On non-final calls, |len| must be a
+// multiple of 16.  On the final call, |len| can be any value.
+//
+// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
+// from a 128-bit big endian counter that increments by 1 for each AES block.
+// HOWEVER, to keep the assembly code simple, some of the counter management is
+// left to the caller.  aes_ctr64_crypt_* take the counter in little endian
+// form, only increment the low 64 bits internally, do the conversion to big
+// endian internally, and don't write the updated counter back to memory.  The
+// caller is responsible for converting the starting IV to the little endian
+// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
+// being needed and splitting at that point with a carry done in between, and
+// updating le_ctr after each part if the message is multi-part.
+//
+// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
+// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf).  XCTR is an
+// easier-to-implement variant of CTR that uses little endian byte order and
+// eliminates carries.  |ctr| is the per-message block counter starting at 1.
+
+.set	VL, 16
+.set	USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
+	_aes_ctr_crypt	0
+SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
+	_aes_ctr_crypt	1
+SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+.set	VL, 32
+.set	USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
+	_aes_ctr_crypt	0
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
+	_aes_ctr_crypt	1
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
+
+.set	VL, 32
+.set	USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
+	_aes_ctr_crypt	0
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
+	_aes_ctr_crypt	1
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)
+
+.set	VL, 64
+.set	USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
+	_aes_ctr_crypt	0
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
+	_aes_ctr_crypt	1
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
+#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 8a3e23fbcf85..93ba0ddbe009 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -1,11 +1,50 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * AES-XTS for modern x86_64 CPUs
- *
- * Copyright 2024 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-XTS for modern x86_64 CPUs
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
+// of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 
 /*
  * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
deleted file mode 100644
index 2402b9418cd7..000000000000
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ /dev/null
@@ -1,597 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
-/*
- * AES CTR mode by8 optimization with AVX instructions. (x86_64)
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Sean Gulley <sean.m.gulley@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- */
-/*
- * This is AES128/192/256 CTR mode optimization implementation. It requires
- * the support of Intel(R) AESNI and AVX instructions.
- *
- * This work was inspired by the AES CTR mode optimization published
- * in Intel Optimized IPSEC Cryptographic library.
- * Additional information on it can be found at:
- *    https://github.com/intel/intel-ipsec-mb
- */
-
-#include <linux/linkage.h>
-
-#define VMOVDQ		vmovdqu
-
-/*
- * Note: the "x" prefix in these aliases means "this is an xmm register".  The
- * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
- * counter".
- */
-#define xdata0		%xmm0
-#define xdata1		%xmm1
-#define xdata2		%xmm2
-#define xdata3		%xmm3
-#define xdata4		%xmm4
-#define xdata5		%xmm5
-#define xdata6		%xmm6
-#define xdata7		%xmm7
-#define xcounter	%xmm8	// CTR mode only
-#define xiv		%xmm8	// XCTR mode only
-#define xbyteswap	%xmm9	// CTR mode only
-#define xtmp		%xmm9	// XCTR mode only
-#define xkey0		%xmm10
-#define xkey4		%xmm11
-#define xkey8		%xmm12
-#define xkey12		%xmm13
-#define xkeyA		%xmm14
-#define xkeyB		%xmm15
-
-#define p_in		%rdi
-#define p_iv		%rsi
-#define p_keys		%rdx
-#define p_out		%rcx
-#define num_bytes	%r8
-#define counter		%r9	// XCTR mode only
-#define tmp		%r10
-#define	DDQ_DATA	0
-#define	XDATA		1
-#define KEY_128		1
-#define KEY_192		2
-#define KEY_256		3
-
-.section .rodata
-.align 16
-
-byteswap_const:
-	.octa 0x000102030405060708090A0B0C0D0E0F
-ddq_low_msk:
-	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
-ddq_high_add_1:
-	.octa 0x00000000000000010000000000000000
-ddq_add_1:
-	.octa 0x00000000000000000000000000000001
-ddq_add_2:
-	.octa 0x00000000000000000000000000000002
-ddq_add_3:
-	.octa 0x00000000000000000000000000000003
-ddq_add_4:
-	.octa 0x00000000000000000000000000000004
-ddq_add_5:
-	.octa 0x00000000000000000000000000000005
-ddq_add_6:
-	.octa 0x00000000000000000000000000000006
-ddq_add_7:
-	.octa 0x00000000000000000000000000000007
-ddq_add_8:
-	.octa 0x00000000000000000000000000000008
-
-.text
-
-/* generate a unique variable for ddq_add_x */
-
-/* generate a unique variable for xmm register */
-.macro setxdata n
-	var_xdata = %xmm\n
-.endm
-
-/* club the numeric 'id' to the symbol 'name' */
-
-.macro club name, id
-.altmacro
-	.if \name == XDATA
-		setxdata %\id
-	.endif
-.noaltmacro
-.endm
-
-/*
- * do_aes num_in_par load_keys key_len
- * This increments p_in, but not p_out
- */
-.macro do_aes b, k, key_len, xctr
-	.set by, \b
-	.set load_keys, \k
-	.set klen, \key_len
-
-	.if (load_keys)
-		vmovdqa	0*16(p_keys), xkey0
-	.endif
-
-	.if \xctr
-		movq counter, xtmp
-		.set i, 0
-		.rept (by)
-			club XDATA, i
-			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
-			.set i, (i +1)
-		.endr
-		.set i, 0
-		.rept (by)
-			club	XDATA, i
-			vpxor	xiv, var_xdata, var_xdata
-			.set i, (i +1)
-		.endr
-	.else
-		vpshufb	xbyteswap, xcounter, xdata0
-		.set i, 1
-		.rept (by - 1)
-			club XDATA, i
-			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
-			vptest	ddq_low_msk(%rip), var_xdata
-			jnz 1f
-			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
-			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
-			1:
-			vpshufb	xbyteswap, var_xdata, var_xdata
-			.set i, (i +1)
-		.endr
-	.endif
-
-	vmovdqa	1*16(p_keys), xkeyA
-
-	vpxor	xkey0, xdata0, xdata0
-	.if \xctr
-		add $by, counter
-	.else
-		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
-		vptest	ddq_low_msk(%rip), xcounter
-		jnz	1f
-		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
-		1:
-	.endif
-
-	.set i, 1
-	.rept (by - 1)
-		club XDATA, i
-		vpxor	xkey0, var_xdata, var_xdata
-		.set i, (i +1)
-	.endr
-
-	vmovdqa	2*16(p_keys), xkeyB
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
-		.set i, (i +1)
-	.endr
-
-	.if (klen == KEY_128)
-		.if (load_keys)
-			vmovdqa	3*16(p_keys), xkey4
-		.endif
-	.else
-		vmovdqa	3*16(p_keys), xkeyA
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
-		.set i, (i +1)
-	.endr
-
-	add	$(16*by), p_in
-
-	.if (klen == KEY_128)
-		vmovdqa	4*16(p_keys), xkeyB
-	.else
-		.if (load_keys)
-			vmovdqa	4*16(p_keys), xkey4
-		.endif
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 3 */
-		.if (klen == KEY_128)
-			vaesenc	xkey4, var_xdata, var_xdata
-		.else
-			vaesenc	xkeyA, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	vmovdqa	5*16(p_keys), xkeyA
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 4 */
-		.if (klen == KEY_128)
-			vaesenc	xkeyB, var_xdata, var_xdata
-		.else
-			vaesenc	xkey4, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	.if (klen == KEY_128)
-		.if (load_keys)
-			vmovdqa	6*16(p_keys), xkey8
-		.endif
-	.else
-		vmovdqa	6*16(p_keys), xkeyB
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
-		.set i, (i +1)
-	.endr
-
-	vmovdqa	7*16(p_keys), xkeyA
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 6 */
-		.if (klen == KEY_128)
-			vaesenc	xkey8, var_xdata, var_xdata
-		.else
-			vaesenc	xkeyB, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	.if (klen == KEY_128)
-		vmovdqa	8*16(p_keys), xkeyB
-	.else
-		.if (load_keys)
-			vmovdqa	8*16(p_keys), xkey8
-		.endif
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
-		.set i, (i +1)
-	.endr
-
-	.if (klen == KEY_128)
-		.if (load_keys)
-			vmovdqa	9*16(p_keys), xkey12
-		.endif
-	.else
-		vmovdqa	9*16(p_keys), xkeyA
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 8 */
-		.if (klen == KEY_128)
-			vaesenc	xkeyB, var_xdata, var_xdata
-		.else
-			vaesenc	xkey8, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	vmovdqa	10*16(p_keys), xkeyB
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 9 */
-		.if (klen == KEY_128)
-			vaesenc	xkey12, var_xdata, var_xdata
-		.else
-			vaesenc	xkeyA, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	.if (klen != KEY_128)
-		vmovdqa	11*16(p_keys), xkeyA
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		/* key 10 */
-		.if (klen == KEY_128)
-			vaesenclast	xkeyB, var_xdata, var_xdata
-		.else
-			vaesenc	xkeyB, var_xdata, var_xdata
-		.endif
-		.set i, (i +1)
-	.endr
-
-	.if (klen != KEY_128)
-		.if (load_keys)
-			vmovdqa	12*16(p_keys), xkey12
-		.endif
-
-		.set i, 0
-		.rept by
-			club XDATA, i
-			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
-			.set i, (i +1)
-		.endr
-
-		.if (klen == KEY_256)
-			vmovdqa	13*16(p_keys), xkeyA
-		.endif
-
-		.set i, 0
-		.rept by
-			club XDATA, i
-			.if (klen == KEY_256)
-				/* key 12 */
-				vaesenc	xkey12, var_xdata, var_xdata
-			.else
-				vaesenclast xkey12, var_xdata, var_xdata
-			.endif
-			.set i, (i +1)
-		.endr
-
-		.if (klen == KEY_256)
-			vmovdqa	14*16(p_keys), xkeyB
-
-			.set i, 0
-			.rept by
-				club XDATA, i
-				/* key 13 */
-				vaesenc	xkeyA, var_xdata, var_xdata
-				.set i, (i +1)
-			.endr
-
-			.set i, 0
-			.rept by
-				club XDATA, i
-				/* key 14 */
-				vaesenclast	xkeyB, var_xdata, var_xdata
-				.set i, (i +1)
-			.endr
-		.endif
-	.endif
-
-	.set i, 0
-	.rept (by / 2)
-		.set j, (i+1)
-		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
-		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
-		club XDATA, i
-		vpxor	xkeyA, var_xdata, var_xdata
-		club XDATA, j
-		vpxor	xkeyB, var_xdata, var_xdata
-		.set i, (i+2)
-	.endr
-
-	.if (i < by)
-		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
-		club XDATA, i
-		vpxor	xkeyA, var_xdata, var_xdata
-	.endif
-
-	.set i, 0
-	.rept by
-		club XDATA, i
-		VMOVDQ	var_xdata, i*16(p_out)
-		.set i, (i+1)
-	.endr
-.endm
-
-.macro do_aes_load val, key_len, xctr
-	do_aes \val, 1, \key_len, \xctr
-.endm
-
-.macro do_aes_noload val, key_len, xctr
-	do_aes \val, 0, \key_len, \xctr
-.endm
-
-/* main body of aes ctr load */
-
-.macro do_aes_ctrmain key_len, xctr
-	cmp	$16, num_bytes
-	jb	.Ldo_return2\xctr\key_len
-
-	.if \xctr
-		shr	$4, counter
-		vmovdqu	(p_iv), xiv
-	.else
-		vmovdqa	byteswap_const(%rip), xbyteswap
-		vmovdqu	(p_iv), xcounter
-		vpshufb	xbyteswap, xcounter, xcounter
-	.endif
-
-	mov	num_bytes, tmp
-	and	$(7*16), tmp
-	jz	.Lmult_of_8_blks\xctr\key_len
-
-	/* 1 <= tmp <= 7 */
-	cmp	$(4*16), tmp
-	jg	.Lgt4\xctr\key_len
-	je	.Leq4\xctr\key_len
-
-.Llt4\xctr\key_len:
-	cmp	$(2*16), tmp
-	jg	.Leq3\xctr\key_len
-	je	.Leq2\xctr\key_len
-
-.Leq1\xctr\key_len:
-	do_aes_load	1, \key_len, \xctr
-	add	$(1*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Leq2\xctr\key_len:
-	do_aes_load	2, \key_len, \xctr
-	add	$(2*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-
-.Leq3\xctr\key_len:
-	do_aes_load	3, \key_len, \xctr
-	add	$(3*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Leq4\xctr\key_len:
-	do_aes_load	4, \key_len, \xctr
-	add	$(4*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Lgt4\xctr\key_len:
-	cmp	$(6*16), tmp
-	jg	.Leq7\xctr\key_len
-	je	.Leq6\xctr\key_len
-
-.Leq5\xctr\key_len:
-	do_aes_load	5, \key_len, \xctr
-	add	$(5*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Leq6\xctr\key_len:
-	do_aes_load	6, \key_len, \xctr
-	add	$(6*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Leq7\xctr\key_len:
-	do_aes_load	7, \key_len, \xctr
-	add	$(7*16), p_out
-	and	$(~7*16), num_bytes
-	jz	.Ldo_return2\xctr\key_len
-	jmp	.Lmain_loop2\xctr\key_len
-
-.Lmult_of_8_blks\xctr\key_len:
-	.if (\key_len != KEY_128)
-		vmovdqa	0*16(p_keys), xkey0
-		vmovdqa	4*16(p_keys), xkey4
-		vmovdqa	8*16(p_keys), xkey8
-		vmovdqa	12*16(p_keys), xkey12
-	.else
-		vmovdqa	0*16(p_keys), xkey0
-		vmovdqa	3*16(p_keys), xkey4
-		vmovdqa	6*16(p_keys), xkey8
-		vmovdqa	9*16(p_keys), xkey12
-	.endif
-.align 16
-.Lmain_loop2\xctr\key_len:
-	/* num_bytes is a multiple of 8 and >0 */
-	do_aes_noload	8, \key_len, \xctr
-	add	$(8*16), p_out
-	sub	$(8*16), num_bytes
-	jne	.Lmain_loop2\xctr\key_len
-
-.Ldo_return2\xctr\key_len:
-	.if !\xctr
-		/* return updated IV */
-		vpshufb	xbyteswap, xcounter, xcounter
-		vmovdqu	xcounter, (p_iv)
-	.endif
-	RET
-.endm
-
-/*
- * routine to do AES128 CTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
- *			unsigned int num_bytes)
- */
-SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_128 0
-
-SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
-
-/*
- * routine to do AES192 CTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
- *			unsigned int num_bytes)
- */
-SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_192 0
-
-SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
-
-/*
- * routine to do AES256 CTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
- *			unsigned int num_bytes)
- */
-SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_256 0
-
-SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
-
-/*
- * routine to do AES128 XCTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
- * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
- */
-SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_128 1
-
-SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
-
-/*
- * routine to do AES192 XCTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
- * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
- */
-SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_192 1
-
-SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
-
-/*
- * routine to do AES256 XCTR enc/decrypt "by8"
- * XMM registers are clobbered.
- * Saving/restoring must be done at a higher level
- * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
- * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
- */
-SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
-	/* call the aes main loop */
-	do_aes_ctrmain KEY_256 1
-
-SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb153eff9331..b37881bb9f15 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -17,6 +17,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/objtool.h>
 #include <asm/frame.h>
 
 #define STATE1	%xmm0
@@ -1071,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc)
  *		      size_t len, u8 *iv)
  */
 SYM_FUNC_START(aesni_ctr_enc)
+	ANNOTATE_NOENDBR
 	FRAME_BEGIN
 	cmp $16, LEN
 	jb .Lctr_enc_just_ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 11e95fc62636..bc655d794a95 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -23,7 +23,6 @@
 #include <linux/err.h>
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
-#include <crypto/ctr.h>
 #include <crypto/b128ops.h>
 #include <crypto/gcm.h>
 #include <crypto/xts.h>
@@ -82,30 +81,8 @@ asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
 #ifdef CONFIG_X86_64
-
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
-DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
-
-asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
-		void *keys, u8 *out, unsigned int num_bytes);
-asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
-		void *keys, u8 *out, unsigned int num_bytes);
-asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
-		void *keys, u8 *out, unsigned int num_bytes);
-
-
-asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
-	const void *keys, u8 *out, unsigned int num_bytes,
-	unsigned int byte_ctr);
-
-asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
-	const void *keys, u8 *out, unsigned int num_bytes,
-	unsigned int byte_ctr);
-
-asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
-	const void *keys, u8 *out, unsigned int num_bytes,
-	unsigned int byte_ctr);
 #endif
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
@@ -376,24 +353,8 @@ static int cts_cbc_decrypt(struct skcipher_request *req)
 }
 
 #ifdef CONFIG_X86_64
-static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
-			      const u8 *in, unsigned int len, u8 *iv)
-{
-	/*
-	 * based on key length, override with the by8 version
-	 * of ctr mode encryption/decryption for improved performance
-	 * aes_set_key_common() ensures that key length is one of
-	 * {128,192,256}
-	 */
-	if (ctx->key_length == AES_KEYSIZE_128)
-		aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
-	else if (ctx->key_length == AES_KEYSIZE_192)
-		aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
-	else
-		aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
+/* This is the non-AVX version. */
+static int ctr_crypt_aesni(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
@@ -407,10 +368,9 @@ static int ctr_crypt(struct skcipher_request *req)
 	while ((nbytes = walk.nbytes) > 0) {
 		kernel_fpu_begin();
 		if (nbytes & AES_BLOCK_MASK)
-			static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr,
-						       walk.src.virt.addr,
-						       nbytes & AES_BLOCK_MASK,
-						       walk.iv);
+			aesni_ctr_enc(ctx, walk.dst.virt.addr,
+				      walk.src.virt.addr,
+				      nbytes & AES_BLOCK_MASK, walk.iv);
 		nbytes &= ~AES_BLOCK_MASK;
 
 		if (walk.nbytes == walk.total && nbytes > 0) {
@@ -426,59 +386,6 @@ static int ctr_crypt(struct skcipher_request *req)
 	}
 	return err;
 }
-
-static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
-				   const u8 *in, unsigned int len, u8 *iv,
-				   unsigned int byte_ctr)
-{
-	if (ctx->key_length == AES_KEYSIZE_128)
-		aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
-					 byte_ctr);
-	else if (ctx->key_length == AES_KEYSIZE_192)
-		aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
-					 byte_ctr);
-	else
-		aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
-					 byte_ctr);
-}
-
-static int xctr_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
-	u8 keystream[AES_BLOCK_SIZE];
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	unsigned int byte_ctr = 0;
-	int err;
-	__le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	while ((nbytes = walk.nbytes) > 0) {
-		kernel_fpu_begin();
-		if (nbytes & AES_BLOCK_MASK)
-			aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
-				walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
-				walk.iv, byte_ctr);
-		nbytes &= ~AES_BLOCK_MASK;
-		byte_ctr += walk.nbytes - nbytes;
-
-		if (walk.nbytes == walk.total && nbytes > 0) {
-			memcpy(block, walk.iv, AES_BLOCK_SIZE);
-			block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
-			aesni_enc(ctx, keystream, (u8 *)block);
-			crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
-				       nbytes, walk.src.virt.addr + walk.nbytes
-				       - nbytes, keystream, nbytes);
-			byte_ctr += nbytes;
-			nbytes = 0;
-		}
-		kernel_fpu_end();
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-	return err;
-}
 #endif
 
 static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
@@ -581,11 +488,8 @@ xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
-	const unsigned int cryptlen = req->cryptlen;
-	struct scatterlist *src = req->src;
-	struct scatterlist *dst = req->dst;
 
-	if (unlikely(cryptlen < AES_BLOCK_SIZE))
+	if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
 		return -EINVAL;
 
 	kernel_fpu_begin();
@@ -593,23 +497,16 @@ xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
 
 	/*
 	 * In practice, virtually all XTS plaintexts and ciphertexts are either
-	 * 512 or 4096 bytes, aligned such that they don't span page boundaries.
-	 * To optimize the performance of these cases, and also any other case
-	 * where no page boundary is spanned, the below fast-path handles
-	 * single-page sources and destinations as efficiently as possible.
+	 * 512 or 4096 bytes and do not use multiple scatterlist elements.  To
+	 * optimize the performance of these cases, the below fast-path handles
+	 * single-scatterlist-element messages as efficiently as possible.  The
+	 * code is 64-bit specific, as it assumes no page mapping is needed.
 	 */
-	if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
-		   src->offset + cryptlen <= PAGE_SIZE &&
-		   dst->offset + cryptlen <= PAGE_SIZE)) {
-		struct page *src_page = sg_page(src);
-		struct page *dst_page = sg_page(dst);
-		void *src_virt = kmap_local_page(src_page) + src->offset;
-		void *dst_virt = kmap_local_page(dst_page) + dst->offset;
-
-		(*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
-			      req->iv);
-		kunmap_local(dst_virt);
-		kunmap_local(src_virt);
+	if (IS_ENABLED(CONFIG_X86_64) &&
+	    likely(req->src->length >= req->cryptlen &&
+		   req->dst->length >= req->cryptlen)) {
+		(*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
+			      sg_virt(req->dst), req->cryptlen, req->iv);
 		kernel_fpu_end();
 		return 0;
 	}
@@ -731,8 +628,8 @@ static struct skcipher_alg aesni_skciphers[] = {
 		.ivsize		= AES_BLOCK_SIZE,
 		.chunksize	= AES_BLOCK_SIZE,
 		.setkey		= aesni_skcipher_setkey,
-		.encrypt	= ctr_crypt,
-		.decrypt	= ctr_crypt,
+		.encrypt	= ctr_crypt_aesni,
+		.decrypt	= ctr_crypt_aesni,
 #endif
 	}, {
 		.base = {
@@ -758,35 +655,105 @@ static
 struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
 
 #ifdef CONFIG_X86_64
-/*
- * XCTR does not have a non-AVX implementation, so it must be enabled
- * conditionally.
- */
-static struct skcipher_alg aesni_xctr = {
-	.base = {
-		.cra_name		= "__xctr(aes)",
-		.cra_driver_name	= "__xctr-aes-aesni",
-		.cra_priority		= 400,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= 1,
-		.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= AES_MIN_KEY_SIZE,
-	.max_keysize	= AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.chunksize	= AES_BLOCK_SIZE,
-	.setkey		= aesni_skcipher_setkey,
-	.encrypt	= xctr_crypt,
-	.decrypt	= xctr_crypt,
-};
-
-static struct simd_skcipher_alg *aesni_simd_xctr;
-
 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 				   u8 iv[AES_BLOCK_SIZE]);
 
-#define DEFINE_XTS_ALG(suffix, driver_name, priority)			       \
+/* __always_inline to avoid indirect call */
+static __always_inline int
+ctr_crypt(struct skcipher_request *req,
+	  void (*ctr64_func)(const struct crypto_aes_ctx *key,
+			     const u8 *src, u8 *dst, int len,
+			     const u64 le_ctr[2]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
+	unsigned int nbytes, p1_nbytes, nblocks;
+	struct skcipher_walk walk;
+	u64 le_ctr[2];
+	u64 ctr64;
+	int err;
+
+	ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]);
+	le_ctr[1] = get_unaligned_be64(&req->iv[0]);
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	while ((nbytes = walk.nbytes) != 0) {
+		if (nbytes < walk.total) {
+			/* Not the end yet, so keep the length block-aligned. */
+			nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+			nblocks = nbytes / AES_BLOCK_SIZE;
+		} else {
+			/* It's the end, so include any final partial block. */
+			nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
+		}
+		ctr64 += nblocks;
+
+		kernel_fpu_begin();
+		if (likely(ctr64 >= nblocks)) {
+			/* The low 64 bits of the counter won't overflow. */
+			(*ctr64_func)(key, walk.src.virt.addr,
+				      walk.dst.virt.addr, nbytes, le_ctr);
+		} else {
+			/*
+			 * The low 64 bits of the counter will overflow.  The
+			 * assembly doesn't handle this case, so split the
+			 * operation into two at the point where the overflow
+			 * will occur.  After the first part, add the carry bit.
+			 */
+			p1_nbytes = min_t(unsigned int, nbytes,
+					  (nblocks - ctr64) * AES_BLOCK_SIZE);
+			(*ctr64_func)(key, walk.src.virt.addr,
+				      walk.dst.virt.addr, p1_nbytes, le_ctr);
+			le_ctr[0] = 0;
+			le_ctr[1]++;
+			(*ctr64_func)(key, walk.src.virt.addr + p1_nbytes,
+				      walk.dst.virt.addr + p1_nbytes,
+				      nbytes - p1_nbytes, le_ctr);
+		}
+		kernel_fpu_end();
+		le_ctr[0] = ctr64;
+
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	put_unaligned_be64(ctr64, &req->iv[8]);
+	put_unaligned_be64(le_ctr[1], &req->iv[0]);
+
+	return err;
+}
+
+/* __always_inline to avoid indirect call */
+static __always_inline int
+xctr_crypt(struct skcipher_request *req,
+	   void (*xctr_func)(const struct crypto_aes_ctx *key,
+			     const u8 *src, u8 *dst, int len,
+			     const u8 iv[AES_BLOCK_SIZE], u64 ctr))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	u64 ctr = 1;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+	while ((nbytes = walk.nbytes) != 0) {
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+
+		kernel_fpu_begin();
+		(*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr,
+			     nbytes, req->iv, ctr);
+		kernel_fpu_end();
+
+		ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+	return err;
+}
+
+#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority)	       \
 									       \
 asmlinkage void								       \
 aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,      \
@@ -805,32 +772,80 @@ static int xts_decrypt_##suffix(struct skcipher_request *req)		       \
 	return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix);   \
 }									       \
 									       \
-static struct skcipher_alg aes_xts_alg_##suffix = {			       \
-	.base = {							       \
-		.cra_name		= "__xts(aes)",			       \
-		.cra_driver_name	= "__" driver_name,		       \
-		.cra_priority		= priority,			       \
-		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
-		.cra_blocksize		= AES_BLOCK_SIZE,		       \
-		.cra_ctxsize		= XTS_AES_CTX_SIZE,		       \
-		.cra_module		= THIS_MODULE,			       \
-	},								       \
-	.min_keysize	= 2 * AES_MIN_KEY_SIZE,				       \
-	.max_keysize	= 2 * AES_MAX_KEY_SIZE,				       \
-	.ivsize		= AES_BLOCK_SIZE,				       \
-	.walksize	= 2 * AES_BLOCK_SIZE,				       \
-	.setkey		= xts_setkey_aesni,				       \
-	.encrypt	= xts_encrypt_##suffix,				       \
-	.decrypt	= xts_decrypt_##suffix,				       \
-};									       \
+asmlinkage void								       \
+aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,		       \
+			 const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\
+									       \
+static int ctr_crypt_##suffix(struct skcipher_request *req)		       \
+{									       \
+	return ctr_crypt(req, aes_ctr64_crypt_##suffix);		       \
+}									       \
+									       \
+asmlinkage void								       \
+aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,		       \
+			const u8 *src, u8 *dst, int len,		       \
+			const u8 iv[AES_BLOCK_SIZE], u64 ctr);		       \
 									       \
-static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
+static int xctr_crypt_##suffix(struct skcipher_request *req)		       \
+{									       \
+	return xctr_crypt(req, aes_xctr_crypt_##suffix);		       \
+}									       \
+									       \
+static struct skcipher_alg skcipher_algs_##suffix[] = {{		       \
+	.base.cra_name		= "__xts(aes)",				       \
+	.base.cra_driver_name	= "__xts-aes-" driver_name_suffix,	       \
+	.base.cra_priority	= priority,				       \
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
+	.base.cra_blocksize	= AES_BLOCK_SIZE,			       \
+	.base.cra_ctxsize	= XTS_AES_CTX_SIZE,			       \
+	.base.cra_module	= THIS_MODULE,				       \
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,			       \
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,			       \
+	.ivsize			= AES_BLOCK_SIZE,			       \
+	.walksize		= 2 * AES_BLOCK_SIZE,			       \
+	.setkey			= xts_setkey_aesni,			       \
+	.encrypt		= xts_encrypt_##suffix,			       \
+	.decrypt		= xts_decrypt_##suffix,			       \
+}, {									       \
+	.base.cra_name		= "__ctr(aes)",				       \
+	.base.cra_driver_name	= "__ctr-aes-" driver_name_suffix,	       \
+	.base.cra_priority	= priority,				       \
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
+	.base.cra_blocksize	= 1,					       \
+	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE,			       \
+	.base.cra_module	= THIS_MODULE,				       \
+	.min_keysize		= AES_MIN_KEY_SIZE,			       \
+	.max_keysize		= AES_MAX_KEY_SIZE,			       \
+	.ivsize			= AES_BLOCK_SIZE,			       \
+	.chunksize		= AES_BLOCK_SIZE,			       \
+	.setkey			= aesni_skcipher_setkey,		       \
+	.encrypt		= ctr_crypt_##suffix,			       \
+	.decrypt		= ctr_crypt_##suffix,			       \
+}, {									       \
+	.base.cra_name		= "__xctr(aes)",			       \
+	.base.cra_driver_name	= "__xctr-aes-" driver_name_suffix,	       \
+	.base.cra_priority	= priority,				       \
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
+	.base.cra_blocksize	= 1,					       \
+	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE,			       \
+	.base.cra_module	= THIS_MODULE,				       \
+	.min_keysize		= AES_MIN_KEY_SIZE,			       \
+	.max_keysize		= AES_MAX_KEY_SIZE,			       \
+	.ivsize			= AES_BLOCK_SIZE,			       \
+	.chunksize		= AES_BLOCK_SIZE,			       \
+	.setkey			= aesni_skcipher_setkey,		       \
+	.encrypt		= xctr_crypt_##suffix,			       \
+	.decrypt		= xctr_crypt_##suffix,			       \
+}};									       \
+									       \
+static struct simd_skcipher_alg *					       \
+simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)]
 
-DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
+DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
-DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
-DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
-DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800);
 #endif
 
 /* The common part of the x86_64 AES-GCM key struct */
@@ -1291,41 +1306,40 @@ static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
 	scatterwalk_start(&walk, sg_src);
 
 	while (assoclen) {
-		unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen);
-		void *mapped = scatterwalk_map(&walk);
-		const void *src = mapped;
+		unsigned int orig_len_this_step = scatterwalk_next(
+			&walk, assoclen);
+		unsigned int len_this_step = orig_len_this_step;
 		unsigned int len;
+		const u8 *src = walk.addr;
 
-		assoclen -= len_this_page;
-		scatterwalk_advance(&walk, len_this_page);
 		if (unlikely(pos)) {
-			len = min(len_this_page, 16 - pos);
+			len = min(len_this_step, 16 - pos);
 			memcpy(&buf[pos], src, len);
 			pos += len;
 			src += len;
-			len_this_page -= len;
+			len_this_step -= len;
 			if (pos < 16)
 				goto next;
 			aes_gcm_aad_update(key, ghash_acc, buf, 16, flags);
 			pos = 0;
 		}
-		len = len_this_page;
+		len = len_this_step;
 		if (unlikely(assoclen)) /* Not the last segment yet? */
 			len = round_down(len, 16);
 		aes_gcm_aad_update(key, ghash_acc, src, len, flags);
 		src += len;
-		len_this_page -= len;
-		if (unlikely(len_this_page)) {
-			memcpy(buf, src, len_this_page);
-			pos = len_this_page;
+		len_this_step -= len;
+		if (unlikely(len_this_step)) {
+			memcpy(buf, src, len_this_step);
+			pos = len_this_step;
 		}
 next:
-		scatterwalk_unmap(mapped);
-		scatterwalk_pagedone(&walk, 0, assoclen);
+		scatterwalk_done_src(&walk, orig_len_this_step);
 		if (need_resched()) {
 			kernel_fpu_end();
 			kernel_fpu_begin();
 		}
+		assoclen -= orig_len_this_step;
 	}
 	if (unlikely(pos))
 		aes_gcm_aad_update(key, ghash_acc, buf, pos, flags);
@@ -1536,34 +1550,15 @@ DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
 		AES_GCM_KEY_AVX10_SIZE, 800);
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
 
-/*
- * This is a list of CPU models that are known to suffer from downclocking when
- * zmm registers (512-bit vectors) are used.  On these CPUs, the AES mode
- * implementations with zmm registers won't be used by default.  Implementations
- * with ymm registers (256-bit vectors) will be used by default instead.
- */
-static const struct x86_cpu_id zmm_exclusion_list[] = {
-	X86_MATCH_VFM(INTEL_SKYLAKE_X,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_X,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_D,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_L,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_NNPI,	0),
-	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	0),
-	X86_MATCH_VFM(INTEL_TIGERLAKE,		0),
-	/* Allow Rocket Lake and later, and Sapphire Rapids and later. */
-	/* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
-	{},
-};
-
 static int __init register_avx_algs(void)
 {
 	int err;
 
 	if (!boot_cpu_has(X86_FEATURE_AVX))
 		return 0;
-	err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1,
-					     &aes_xts_simdalg_aesni_avx);
+	err = simd_register_skciphers_compat(skcipher_algs_aesni_avx,
+					     ARRAY_SIZE(skcipher_algs_aesni_avx),
+					     simd_skcipher_algs_aesni_avx);
 	if (err)
 		return err;
 	err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
@@ -1571,6 +1566,12 @@ static int __init register_avx_algs(void)
 					 aes_gcm_simdalgs_aesni_avx);
 	if (err)
 		return err;
+	/*
+	 * Note: not all the algorithms registered below actually require
+	 * VPCLMULQDQ.  But in practice every CPU with VAES also has VPCLMULQDQ.
+	 * Similarly, the assembler support was added at about the same time.
+	 * For simplicity, just always check for VAES and VPCLMULQDQ together.
+	 */
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
 	    !boot_cpu_has(X86_FEATURE_VAES) ||
@@ -1578,8 +1579,9 @@ static int __init register_avx_algs(void)
 	    !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
 	    !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
 		return 0;
-	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1,
-					     &aes_xts_simdalg_vaes_avx2);
+	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2,
+					     ARRAY_SIZE(skcipher_algs_vaes_avx2),
+					     simd_skcipher_algs_vaes_avx2);
 	if (err)
 		return err;
 
@@ -1590,8 +1592,9 @@ static int __init register_avx_algs(void)
 			       XFEATURE_MASK_AVX512, NULL))
 		return 0;
 
-	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1,
-					     &aes_xts_simdalg_vaes_avx10_256);
+	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256,
+					     ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
+					     simd_skcipher_algs_vaes_avx10_256);
 	if (err)
 		return err;
 	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
@@ -1600,16 +1603,18 @@ static int __init register_avx_algs(void)
 	if (err)
 		return err;
 
-	if (x86_match_cpu(zmm_exclusion_list)) {
+	if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
 		int i;
 
-		aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++)
+			skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1;
 		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
 			aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
 	}
 
-	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
-					     &aes_xts_simdalg_vaes_avx10_512);
+	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512,
+					     ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
+					     simd_skcipher_algs_vaes_avx10_512);
 	if (err)
 		return err;
 	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
@@ -1623,27 +1628,31 @@ static int __init register_avx_algs(void)
 
 static void unregister_avx_algs(void)
 {
-	if (aes_xts_simdalg_aesni_avx)
-		simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
-					  &aes_xts_simdalg_aesni_avx);
+	if (simd_skcipher_algs_aesni_avx[0])
+		simd_unregister_skciphers(skcipher_algs_aesni_avx,
+					  ARRAY_SIZE(skcipher_algs_aesni_avx),
+					  simd_skcipher_algs_aesni_avx);
 	if (aes_gcm_simdalgs_aesni_avx[0])
 		simd_unregister_aeads(aes_gcm_algs_aesni_avx,
 				      ARRAY_SIZE(aes_gcm_algs_aesni_avx),
 				      aes_gcm_simdalgs_aesni_avx);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
-	if (aes_xts_simdalg_vaes_avx2)
-		simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
-					  &aes_xts_simdalg_vaes_avx2);
-	if (aes_xts_simdalg_vaes_avx10_256)
-		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
-					  &aes_xts_simdalg_vaes_avx10_256);
+	if (simd_skcipher_algs_vaes_avx2[0])
+		simd_unregister_skciphers(skcipher_algs_vaes_avx2,
+					  ARRAY_SIZE(skcipher_algs_vaes_avx2),
+					  simd_skcipher_algs_vaes_avx2);
+	if (simd_skcipher_algs_vaes_avx10_256[0])
+		simd_unregister_skciphers(skcipher_algs_vaes_avx10_256,
+					  ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
+					  simd_skcipher_algs_vaes_avx10_256);
 	if (aes_gcm_simdalgs_vaes_avx10_256[0])
 		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
 				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
 				      aes_gcm_simdalgs_vaes_avx10_256);
-	if (aes_xts_simdalg_vaes_avx10_512)
-		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
-					  &aes_xts_simdalg_vaes_avx10_512);
+	if (simd_skcipher_algs_vaes_avx10_512[0])
+		simd_unregister_skciphers(skcipher_algs_vaes_avx10_512,
+					  ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
+					  simd_skcipher_algs_vaes_avx10_512);
 	if (aes_gcm_simdalgs_vaes_avx10_512[0])
 		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
 				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
@@ -1676,13 +1685,6 @@ static int __init aesni_init(void)
 
 	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_AVX)) {
-		/* optimize performance of ctr mode encryption transform */
-		static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
-		pr_info("AES CTR mode by8 optimization enabled\n");
-	}
-#endif /* CONFIG_X86_64 */
 
 	err = crypto_register_alg(&aesni_cipher_alg);
 	if (err)
@@ -1700,14 +1702,6 @@ static int __init aesni_init(void)
 	if (err)
 		goto unregister_skciphers;
 
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_AVX))
-		err = simd_register_skciphers_compat(&aesni_xctr, 1,
-						     &aesni_simd_xctr);
-	if (err)
-		goto unregister_aeads;
-#endif /* CONFIG_X86_64 */
-
 	err = register_avx_algs();
 	if (err)
 		goto unregister_avx;
@@ -1716,11 +1710,6 @@ static int __init aesni_init(void)
 
 unregister_avx:
 	unregister_avx_algs();
-#ifdef CONFIG_X86_64
-	if (aesni_simd_xctr)
-		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
-unregister_aeads:
-#endif /* CONFIG_X86_64 */
 	simd_unregister_aeads(aes_gcm_algs_aesni,
 			      ARRAY_SIZE(aes_gcm_algs_aesni),
 			      aes_gcm_simdalgs_aesni);
@@ -1740,10 +1729,6 @@ static void __exit aesni_exit(void)
 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
 				  aesni_simd_skciphers);
 	crypto_unregister_alg(&aesni_cipher_alg);
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_AVX))
-		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
-#endif /* CONFIG_X86_64 */
 	unregister_avx_algs();
 }
 
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 646477a13e11..1dfef28c1266 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -16,6 +16,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/frame.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
@@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
 	jmp .Ldec_max24;
 SYM_FUNC_END(__camellia_dec_blk16)
 
-SYM_FUNC_START(camellia_ecb_enc_16way)
+SYM_TYPED_FUNC_START(camellia_ecb_enc_16way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst (16 blocks)
@@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
 	RET;
 SYM_FUNC_END(camellia_ecb_enc_16way)
 
-SYM_FUNC_START(camellia_ecb_dec_16way)
+SYM_TYPED_FUNC_START(camellia_ecb_dec_16way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst (16 blocks)
@@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
 	RET;
 SYM_FUNC_END(camellia_ecb_dec_16way)
 
-SYM_FUNC_START(camellia_cbc_dec_16way)
+SYM_TYPED_FUNC_START(camellia_cbc_dec_16way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst (16 blocks)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index a0eb94e53b1b..b1c9b9450555 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/frame.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 816b6bb8bded..824cb94de6c2 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 
 .file "camellia-x86_64-asm_64.S"
 .text
@@ -177,7 +178,7 @@
 	bswapq				RAB0; \
 	movq RAB0,			4*2(RIO);
 
-SYM_FUNC_START(__camellia_enc_blk)
+SYM_TYPED_FUNC_START(__camellia_enc_blk)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk)
 	RET;
 SYM_FUNC_END(__camellia_enc_blk)
 
-SYM_FUNC_START(camellia_dec_blk)
+SYM_TYPED_FUNC_START(camellia_dec_blk)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk)
 		bswapq				RAB1; \
 		movq RAB1,			12*2(RIO);
 
-SYM_FUNC_START(__camellia_enc_blk_2way)
+SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
 	RET;
 SYM_FUNC_END(__camellia_enc_blk_2way)
 
-SYM_FUNC_START(camellia_dec_blk_2way)
+SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 7b3a1cf0984b..8bb74a272879 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -133,12 +133,6 @@ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
 }
 EXPORT_SYMBOL(hchacha_block_arch);
 
-void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
-{
-	chacha_init_generic(state, key, iv);
-}
-EXPORT_SYMBOL(chacha_init_arch);
-
 void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 		       int nrounds)
 {
@@ -169,7 +163,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req,
 
 	err = skcipher_walk_virt(&walk, req, false);
 
-	chacha_init_generic(state, ctx->key, iv);
+	chacha_init(state, ctx->key, iv);
 
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
@@ -211,7 +205,7 @@ static int xchacha_simd(struct skcipher_request *req)
 	struct chacha_ctx subctx;
 	u8 real_iv[16];
 
-	chacha_init_generic(state, ctx->key, req->iv);
+	chacha_init(state, ctx->key, req->iv);
 
 	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
 		kernel_fpu_begin();
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index e88439d3828e..34600f90d8a6 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -73,7 +73,7 @@ static int ecb_crypt(struct skcipher_request *req, const u32 *expkey)
 	err = skcipher_walk_virt(&walk, req, false);
 
 	while ((nbytes = walk.nbytes)) {
-		u8 *wsrc = walk.src.virt.addr;
+		const u8 *wsrc = walk.src.virt.addr;
 		u8 *wdst = walk.dst.virt.addr;
 
 		/* Process four block batch */
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 41bc02e48916..c759ec808bf1 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -189,6 +189,20 @@ static int ghash_async_init(struct ahash_request *req)
 	return crypto_shash_init(desc);
 }
 
+static void ghash_init_cryptd_req(struct ahash_request *req)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+	ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+	ahash_request_set_callback(cryptd_req, req->base.flags,
+				   req->base.complete, req->base.data);
+	ahash_request_set_crypt(cryptd_req, req->src, req->result,
+				req->nbytes);
+}
+
 static int ghash_async_update(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
@@ -198,8 +212,7 @@ static int ghash_async_update(struct ahash_request *req)
 
 	if (!crypto_simd_usable() ||
 	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		ghash_init_cryptd_req(req);
 		return crypto_ahash_update(cryptd_req);
 	} else {
 		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
@@ -216,8 +229,7 @@ static int ghash_async_final(struct ahash_request *req)
 
 	if (!crypto_simd_usable() ||
 	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		ghash_init_cryptd_req(req);
 		return crypto_ahash_final(cryptd_req);
 	} else {
 		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
@@ -257,8 +269,7 @@ static int ghash_async_digest(struct ahash_request *req)
 
 	if (!crypto_simd_usable() ||
 	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		ghash_init_cryptd_req(req);
 		return crypto_ahash_digest(cryptd_req);
 	} else {
 		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 97e283621851..84e47f7f6188 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -9,6 +9,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/frame.h>
 #include "glue_helper-asm-avx.S"
 
@@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
 	RET;
 SYM_FUNC_END(__serpent_dec_blk8_avx)
 
-SYM_FUNC_START(serpent_ecb_enc_8way_avx)
+SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
 	RET;
 SYM_FUNC_END(serpent_ecb_enc_8way_avx)
 
-SYM_FUNC_START(serpent_ecb_dec_8way_avx)
+SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
 	RET;
 SYM_FUNC_END(serpent_ecb_dec_8way_avx)
 
-SYM_FUNC_START(serpent_cbc_dec_8way_avx)
+SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index d2288bf38a8a..071e90e7f0d8 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 
 .file "twofish-x86_64-asm-3way.S"
 .text
@@ -220,7 +221,7 @@
 	rorq $32,			RAB2; \
 	outunpack3(mov, RIO, 2, RAB, 2);
 
-SYM_FUNC_START(__twofish_enc_blk_3way)
+SYM_TYPED_FUNC_START(__twofish_enc_blk_3way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
@@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
 	RET;
 SYM_FUNC_END(__twofish_enc_blk_3way)
 
-SYM_FUNC_START(twofish_dec_blk_3way)
+SYM_TYPED_FUNC_START(twofish_dec_blk_3way)
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rsi: dst
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 775af290cd19..e08b4ba07b93 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -8,6 +8,7 @@
 .text
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/asm-offsets.h>
 
 #define a_offset	0
@@ -202,7 +203,7 @@
 	xor	%r8d,		d ## D;\
 	ror	$1,		d ## D;
 
-SYM_FUNC_START(twofish_enc_blk)
+SYM_TYPED_FUNC_START(twofish_enc_blk)
 	pushq    R1
 
 	/* %rdi contains the ctx address */
@@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk)
 	RET
 SYM_FUNC_END(twofish_enc_blk)
 
-SYM_FUNC_START(twofish_dec_blk)
+SYM_TYPED_FUNC_START(twofish_dec_blk)
 	pushq    R1
 
 	/* %rdi contains the ctx address */