diff options
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/Kconfig | 11 | ||||
-rw-r--r-- | arch/x86/crypto/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/crypto/aegis128-aesni-glue.c | 9 | ||||
-rw-r--r-- | arch/x86/crypto/aes-ctr-avx-x86_64.S | 592 | ||||
-rw-r--r-- | arch/x86/crypto/aes-xts-avx-x86_64.S | 55 | ||||
-rw-r--r-- | arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 597 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S | 2 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_glue.c | 483 | ||||
-rw-r--r-- | arch/x86/crypto/camellia-aesni-avx-asm_64.S | 7 | ||||
-rw-r--r-- | arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 1 | ||||
-rw-r--r-- | arch/x86/crypto/camellia-x86_64-asm_64.S | 9 | ||||
-rw-r--r-- | arch/x86/crypto/chacha_glue.c | 10 | ||||
-rw-r--r-- | arch/x86/crypto/des3_ede_glue.c | 2 | ||||
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_glue.c | 23 | ||||
-rw-r--r-- | arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 7 | ||||
-rw-r--r-- | arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 5 | ||||
-rw-r--r-- | arch/x86/crypto/twofish-x86_64-asm_64.S | 5 |
17 files changed, 927 insertions, 893 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 4757bf922075..3d948f10c94c 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -3,10 +3,12 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" config CRYPTO_CURVE25519_X86 - tristate "Public key crypto: Curve25519 (ADX)" + tristate depends on X86 && 64BIT + select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + default CRYPTO_LIB_CURVE25519_INTERNAL help Curve25519 algorithm @@ -348,11 +350,12 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64 Processes 64 blocks in parallel. config CRYPTO_CHACHA20_X86_64 - tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)" + tristate depends on X86 && 64BIT select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA + default CRYPTO_LIB_CHACHA_INTERNAL help Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms @@ -417,10 +420,12 @@ config CRYPTO_POLYVAL_CLMUL_NI - CLMUL-NI (carry-less multiplication new instructions) config CRYPTO_POLY1305_X86_64 - tristate "Hash functions: Poly1305 (SSE2/AVX2)" + tristate depends on X86 && 64BIT + select CRYPTO_HASH select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 + default CRYPTO_LIB_POLY1305_INTERNAL help Poly1305 authenticator algorithm (RFC7539) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 07b00bfca64b..5d19f41bde58 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,7 +48,7 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ +aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ aes-gcm-aesni-x86_64.o \ aes-xts-avx-x86_64.o ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 01fa568dc5fc..26786e15abac 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -71,10 +71,9 @@ static void crypto_aegis128_aesni_process_ad( scatterwalk_start(&walk, sg_src); while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int size = scatterwalk_next(&walk, assoclen); + const u8 *src = walk.addr; unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; if (pos + size >= AEGIS128_BLOCK_SIZE) { if (pos > 0) { @@ -97,9 +96,7 @@ static void crypto_aegis128_aesni_process_ad( pos += left; assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); + scatterwalk_done_src(&walk, size); } if (pos > 0) { diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S new file mode 100644 index 000000000000..1685d8b24b2c --- /dev/null +++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S @@ -0,0 +1,592 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR +// using the following sets of CPU features: +// - AES-NI && AVX +// - VAES && AVX2 +// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2 +// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2 +// +// See the function definitions at the bottom of the file for more information. + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 + +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + +.Lctr_pattern: + .quad 0, 0 +.Lone: + .quad 1, 0 +.Ltwo: + .quad 2, 0 + .quad 3, 0 + +.Lfour: + .quad 4, 0 + +.text + +// Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Move a vector between registers. +// The registers must be in the first 16 vector registers. +.macro _vmovdqa src, dst +.if VL < 64 + vmovdqa \src, \dst +.else + vmovdqa64 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector +// register. The register operand must be in the first 16 vector registers. +.macro _vbroadcast128 src, dst +.if VL == 16 + vmovdqu \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +// Any register operands must be in the first 16 vector registers. +.macro _vpxor src1, src2, dst +.if VL < 64 + vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + vmovq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + vpinsrq $1, %rax, \dst, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. + add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + vmovq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _store_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + vpextrq $1, \src, %rax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes + vmovq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + vpextrd $1, \src, %eax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes + vmovd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + vpextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + vpextrb $1, \src, 1(\dst) + je .Ldone\@ + vpextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and +// XOR each with the zero-th round key. Also update LE_CTR if !\final. +.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 +.if \is_xctr + .if USE_AVX10 + _vmovdqa LE_CTR, AESDATA\i0 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 + .else + vpxor XCTR_IV, LE_CTR, AESDATA\i0 + vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + .endif + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + + .if USE_AVX10 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 + .else + vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 + vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 + .endif +.else + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0 + _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1 + _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 +.endif +.if !\final + vpaddq LE_CTR_INC2, LE_CTR, LE_CTR +.endif +.endm + +// Do all AES rounds on the data in the given AESDATA vectors, excluding the +// zero-th and last rounds. +.macro _aesenc_loop vecs:vararg + mov KEY, %rax +1: + _vbroadcast128 (%rax), RNDKEY +.irp i, \vecs + vaesenc RNDKEY, AESDATA\i, AESDATA\i +.endr + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +.endm + +// Finalize the keystream blocks in the given AESDATA vectors by doing the last +// AES round, then XOR those keystream blocks with the corresponding data. +// Reduce latency by doing the XOR before the vaesenclast, utilizing the +// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +.macro _aesenclast_and_xor vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY + vaesenclast RNDKEY, AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +// XOR the keystream blocks in the specified AESDATA vectors with the +// corresponding data. +.macro _xor_data vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +.macro _aes_ctr_crypt is_xctr + + // Define register aliases V0-V15 that map to the xmm, ymm, or zmm + // registers according to the selected Vector Length (VL). +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .if VL == 16 + .set V\i, %xmm\i + .elseif VL == 32 + .set V\i, %ymm\i + .elseif VL == 64 + .set V\i, %zmm\i + .else + .error "Unsupported Vector Length (VL)" + .endif +.endr + + // Function arguments + .set KEY, %rdi // Initially points to the start of the + // crypto_aes_ctx, then is advanced to + // point to the index 1 round key + .set KEY32, %edi // Available as temp register after all + // keystream blocks have been generated + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes. + // Note: _load_partial_block relies on + // this being in %ecx. + .set LEN64, %rcx // Zero-extend LEN before using! + .set LEN8, %cl +.if \is_xctr + .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE]; + .set XCTR_CTR, %r9 // u64 ctr; +.else + .set LE_CTR_PTR, %r8 // const u64 le_ctr[2]; +.endif + + // Additional local variables + .set RNDKEYLAST_PTR, %r10 + .set AESDATA0, V0 + .set AESDATA0_XMM, %xmm0 + .set AESDATA1, V1 + .set AESDATA1_XMM, %xmm1 + .set AESDATA2, V2 + .set AESDATA3, V3 + .set AESDATA4, V4 + .set AESDATA5, V5 + .set AESDATA6, V6 + .set AESDATA7, V7 +.if \is_xctr + .set XCTR_IV, V8 +.else + .set BSWAP_MASK, V8 +.endif + .set LE_CTR, V9 + .set LE_CTR_XMM, %xmm9 + .set LE_CTR_INC1, V10 + .set LE_CTR_INC2, V11 + .set RNDKEY0, V12 + .set RNDKEYLAST, V13 + .set RNDKEY, V14 + + // Create the first vector of counters. +.if \is_xctr + .if VL == 16 + vmovq XCTR_CTR, LE_CTR + .elseif VL == 32 + vmovq XCTR_CTR, LE_CTR_XMM + inc XCTR_CTR + vmovq XCTR_CTR, AESDATA0_XMM + vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR + .else + vpbroadcastq XCTR_CTR, LE_CTR + vpsrldq $8, LE_CTR, LE_CTR + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 (XCTR_IV_PTR), XCTR_IV +.else + _vbroadcast128 (LE_CTR_PTR), LE_CTR + .if VL > 16 + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK +.endif + +.if VL == 16 + _vbroadcast128 .Lone(%rip), LE_CTR_INC1 +.elseif VL == 32 + _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1 +.else + _vbroadcast128 .Lfour(%rip), LE_CTR_INC1 +.endif + vpsllq $1, LE_CTR_INC1, LE_CTR_INC2 + + // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). + movl 480(KEY), %eax + + // Compute the pointer to the last round key. + lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR + + // Load the zero-th and last round keys. + _vbroadcast128 (KEY), RNDKEY0 + _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST + + // Make KEY point to the first round key. + add $16, KEY + + // This is the main loop, which encrypts 8 vectors of data at a time. + add $-8*VL, LEN + jl .Lloop_8x_done\@ +.Lloop_8x\@: + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3,4,5,6,7 + sub $-8*VL, SRC + sub $-8*VL, DST + add $-8*VL, LEN + jge .Lloop_8x\@ +.Lloop_8x_done\@: + sub $-8*VL, LEN + jz .Ldone\@ + + // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream + // blocks, depending on the remaining LEN. + + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + cmp $4*VL, LEN + jle .Lenc_tail_atmost4vecs\@ + + // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the + // first 4 to XOR 4 full vectors of data. Then XOR the remaining data. + _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3 + vaesenclast RNDKEYLAST, AESDATA4, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA5, AESDATA1 + vaesenclast RNDKEYLAST, AESDATA6, AESDATA2 + vaesenclast RNDKEYLAST, AESDATA7, AESDATA3 + sub $-4*VL, SRC + sub $-4*VL, DST + add $-4*VL, LEN + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + cmp $3*VL-1, LEN + jle .Lxor_tail_partial_vec_2\@ + _xor_data 2 + cmp $4*VL-1, LEN + jle .Lxor_tail_partial_vec_3\@ + _xor_data 3 + jmp .Ldone\@ + +.Lenc_tail_atmost4vecs\@: + cmp $2*VL, LEN + jle .Lenc_tail_atmost2vecs\@ + + // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the + // first 2 to XOR 2 full vectors of data. Then XOR the remaining data. + _aesenc_loop 0,1,2,3 + _aesenclast_and_xor 0,1 + vaesenclast RNDKEYLAST, AESDATA2, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA3, AESDATA1 + sub $-2*VL, SRC + sub $-2*VL, DST + add $-2*VL, LEN + jmp .Lxor_tail_upto2vecs\@ + +.Lenc_tail_atmost2vecs\@: + // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR + // the remaining data. + _aesenc_loop 0,1 + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA1, AESDATA1 + +.Lxor_tail_upto2vecs\@: + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + jmp .Ldone\@ + +.Lxor_tail_partial_vec_1\@: + add $-1*VL, LEN + jz .Ldone\@ + sub $-1*VL, SRC + sub $-1*VL, DST + _vmovdqa AESDATA1, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_2\@: + add $-2*VL, LEN + jz .Ldone\@ + sub $-2*VL, SRC + sub $-2*VL, DST + _vmovdqa AESDATA2, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_3\@: + add $-3*VL, LEN + jz .Ldone\@ + sub $-3*VL, SRC + sub $-3*VL, DST + _vmovdqa AESDATA3, AESDATA0 + +.Lxor_tail_partial_vec_0\@: + // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked + // loads/stores are available; otherwise it's a bit harder... +.if USE_AVX10 + .if VL <= 32 + mov $-1, %eax + bzhi LEN, %eax, %eax + kmovd %eax, %k1 + .else + mov $-1, %rax + bzhi LEN64, %rax, %rax + kmovq %rax, %k1 + .endif + vmovdqu8 (SRC), AESDATA1{%k1}{z} + _vpxor AESDATA1, AESDATA0, AESDATA0 + vmovdqu8 AESDATA0, (DST){%k1} +.else + .if VL == 32 + cmp $16, LEN + jl 1f + vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM + vmovdqu AESDATA1_XMM, (DST) + add $16, SRC + add $16, DST + sub $16, LEN + jz .Ldone\@ + vextracti128 $1, AESDATA0, AESDATA0_XMM +1: + .endif + mov LEN, %r10d + _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32 + vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM + mov %r10d, %ecx + _store_partial_block AESDATA0_XMM, DST, KEY, KEY32 +.endif + +.Ldone\@: +.if VL > 16 + vzeroupper +.endif + RET +.endm + +// Below are the definitions of the functions generated by the above macro. +// They have the following prototypes: +// +// +// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u64 le_ctr[2]); +// +// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u8 iv[AES_BLOCK_SIZE], u64 ctr); +// +// Both functions generate |len| bytes of keystream, XOR it with the data from +// |src|, and write the result to |dst|. On non-final calls, |len| must be a +// multiple of 16. On the final call, |len| can be any value. +// +// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated +// from a 128-bit big endian counter that increments by 1 for each AES block. +// HOWEVER, to keep the assembly code simple, some of the counter management is +// left to the caller. aes_ctr64_crypt_* take the counter in little endian +// form, only increment the low 64 bits internally, do the conversion to big +// endian internally, and don't write the updated counter back to memory. The +// caller is responsible for converting the starting IV to the little endian +// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits +// being needed and splitting at that point with a carry done in between, and +// updating le_ctr after each part if the message is multi-part. +// +// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption +// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an +// easier-to-implement variant of CTR that uses little endian byte order and +// eliminates carries. |ctr| is the per-message block counter starting at 1. + +.set VL, 16 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) + +.set VL, 32 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256) + +.set VL, 64 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512) +#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 8a3e23fbcf85..93ba0ddbe009 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -1,11 +1,50 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * AES-XTS for modern x86_64 CPUs - * - * Copyright 2024 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-XTS for modern x86_64 CPUs +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. /* * This file implements AES-XTS for modern x86_64 CPUs. To handle the diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S deleted file mode 100644 index 2402b9418cd7..000000000000 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ /dev/null @@ -1,597 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ -/* - * AES CTR mode by8 optimization with AVX instructions. (x86_64) - * - * Copyright(c) 2014 Intel Corporation. - * - * Contact Information: - * James Guilford <james.guilford@intel.com> - * Sean Gulley <sean.m.gulley@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - */ -/* - * This is AES128/192/256 CTR mode optimization implementation. It requires - * the support of Intel(R) AESNI and AVX instructions. - * - * This work was inspired by the AES CTR mode optimization published - * in Intel Optimized IPSEC Cryptographic library. - * Additional information on it can be found at: - * https://github.com/intel/intel-ipsec-mb - */ - -#include <linux/linkage.h> - -#define VMOVDQ vmovdqu - -/* - * Note: the "x" prefix in these aliases means "this is an xmm register". The - * alias prefixes have no relation to XCTR where the "X" prefix means "XOR - * counter". - */ -#define xdata0 %xmm0 -#define xdata1 %xmm1 -#define xdata2 %xmm2 -#define xdata3 %xmm3 -#define xdata4 %xmm4 -#define xdata5 %xmm5 -#define xdata6 %xmm6 -#define xdata7 %xmm7 -#define xcounter %xmm8 // CTR mode only -#define xiv %xmm8 // XCTR mode only -#define xbyteswap %xmm9 // CTR mode only -#define xtmp %xmm9 // XCTR mode only -#define xkey0 %xmm10 -#define xkey4 %xmm11 -#define xkey8 %xmm12 -#define xkey12 %xmm13 -#define xkeyA %xmm14 -#define xkeyB %xmm15 - -#define p_in %rdi -#define p_iv %rsi -#define p_keys %rdx -#define p_out %rcx -#define num_bytes %r8 -#define counter %r9 // XCTR mode only -#define tmp %r10 -#define DDQ_DATA 0 -#define XDATA 1 -#define KEY_128 1 -#define KEY_192 2 -#define KEY_256 3 - -.section .rodata -.align 16 - -byteswap_const: - .octa 0x000102030405060708090A0B0C0D0E0F -ddq_low_msk: - .octa 0x0000000000000000FFFFFFFFFFFFFFFF -ddq_high_add_1: - .octa 0x00000000000000010000000000000000 -ddq_add_1: - .octa 0x00000000000000000000000000000001 -ddq_add_2: - .octa 0x00000000000000000000000000000002 -ddq_add_3: - .octa 0x00000000000000000000000000000003 -ddq_add_4: - .octa 0x00000000000000000000000000000004 -ddq_add_5: - .octa 0x00000000000000000000000000000005 -ddq_add_6: - .octa 0x00000000000000000000000000000006 -ddq_add_7: - .octa 0x00000000000000000000000000000007 -ddq_add_8: - .octa 0x00000000000000000000000000000008 - -.text - -/* generate a unique variable for ddq_add_x */ - -/* generate a unique variable for xmm register */ -.macro setxdata n - var_xdata = %xmm\n -.endm - -/* club the numeric 'id' to the symbol 'name' */ - -.macro club name, id -.altmacro - .if \name == XDATA - setxdata %\id - .endif -.noaltmacro -.endm - -/* - * do_aes num_in_par load_keys key_len - * This increments p_in, but not p_out - */ -.macro do_aes b, k, key_len, xctr - .set by, \b - .set load_keys, \k - .set klen, \key_len - - .if (load_keys) - vmovdqa 0*16(p_keys), xkey0 - .endif - - .if \xctr - movq counter, xtmp - .set i, 0 - .rept (by) - club XDATA, i - vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata - .set i, (i +1) - .endr - .set i, 0 - .rept (by) - club XDATA, i - vpxor xiv, var_xdata, var_xdata - .set i, (i +1) - .endr - .else - vpshufb xbyteswap, xcounter, xdata0 - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - - vmovdqa 1*16(p_keys), xkeyA - - vpxor xkey0, xdata0, xdata0 - .if \xctr - add $by, counter - .else - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - .endif - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpxor xkey0, var_xdata, var_xdata - .set i, (i +1) - .endr - - vmovdqa 2*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 3*16(p_keys), xkey4 - .endif - .else - vmovdqa 3*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ - .set i, (i +1) - .endr - - add $(16*by), p_in - - .if (klen == KEY_128) - vmovdqa 4*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 4*16(p_keys), xkey4 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 3 */ - .if (klen == KEY_128) - vaesenc xkey4, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 5*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 4 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey4, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 6*16(p_keys), xkey8 - .endif - .else - vmovdqa 6*16(p_keys), xkeyB - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ - .set i, (i +1) - .endr - - vmovdqa 7*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 6 */ - .if (klen == KEY_128) - vaesenc xkey8, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - vmovdqa 8*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 8*16(p_keys), xkey8 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 9*16(p_keys), xkey12 - .endif - .else - vmovdqa 9*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 8 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey8, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 10*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 9 */ - .if (klen == KEY_128) - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - vmovdqa 11*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 10 */ - .if (klen == KEY_128) - vaesenclast xkeyB, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - .if (load_keys) - vmovdqa 12*16(p_keys), xkey12 - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 13*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - .if (klen == KEY_256) - /* key 12 */ - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenclast xkey12, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 14*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 13 */ - vaesenc xkeyA, var_xdata, var_xdata - .set i, (i +1) - .endr - - .set i, 0 - .rept by - club XDATA, i - /* key 14 */ - vaesenclast xkeyB, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - .endif - - .set i, 0 - .rept (by / 2) - .set j, (i+1) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - VMOVDQ (j*16 - 16*by)(p_in), xkeyB - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - club XDATA, j - vpxor xkeyB, var_xdata, var_xdata - .set i, (i+2) - .endr - - .if (i < by) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - .endif - - .set i, 0 - .rept by - club XDATA, i - VMOVDQ var_xdata, i*16(p_out) - .set i, (i+1) - .endr -.endm - -.macro do_aes_load val, key_len, xctr - do_aes \val, 1, \key_len, \xctr -.endm - -.macro do_aes_noload val, key_len, xctr - do_aes \val, 0, \key_len, \xctr -.endm - -/* main body of aes ctr load */ - -.macro do_aes_ctrmain key_len, xctr - cmp $16, num_bytes - jb .Ldo_return2\xctr\key_len - - .if \xctr - shr $4, counter - vmovdqu (p_iv), xiv - .else - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter - .endif - - mov num_bytes, tmp - and $(7*16), tmp - jz .Lmult_of_8_blks\xctr\key_len - - /* 1 <= tmp <= 7 */ - cmp $(4*16), tmp - jg .Lgt4\xctr\key_len - je .Leq4\xctr\key_len - -.Llt4\xctr\key_len: - cmp $(2*16), tmp - jg .Leq3\xctr\key_len - je .Leq2\xctr\key_len - -.Leq1\xctr\key_len: - do_aes_load 1, \key_len, \xctr - add $(1*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq2\xctr\key_len: - do_aes_load 2, \key_len, \xctr - add $(2*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - - -.Leq3\xctr\key_len: - do_aes_load 3, \key_len, \xctr - add $(3*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq4\xctr\key_len: - do_aes_load 4, \key_len, \xctr - add $(4*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lgt4\xctr\key_len: - cmp $(6*16), tmp - jg .Leq7\xctr\key_len - je .Leq6\xctr\key_len - -.Leq5\xctr\key_len: - do_aes_load 5, \key_len, \xctr - add $(5*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq6\xctr\key_len: - do_aes_load 6, \key_len, \xctr - add $(6*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq7\xctr\key_len: - do_aes_load 7, \key_len, \xctr - add $(7*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lmult_of_8_blks\xctr\key_len: - .if (\key_len != KEY_128) - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 4*16(p_keys), xkey4 - vmovdqa 8*16(p_keys), xkey8 - vmovdqa 12*16(p_keys), xkey12 - .else - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 3*16(p_keys), xkey4 - vmovdqa 6*16(p_keys), xkey8 - vmovdqa 9*16(p_keys), xkey12 - .endif -.align 16 -.Lmain_loop2\xctr\key_len: - /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len, \xctr - add $(8*16), p_out - sub $(8*16), num_bytes - jne .Lmain_loop2\xctr\key_len - -.Ldo_return2\xctr\key_len: - .if !\xctr - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) - .endif - RET -.endm - -/* - * routine to do AES128 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 0 - -SYM_FUNC_END(aes_ctr_enc_128_avx_by8) - -/* - * routine to do AES192 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 0 - -SYM_FUNC_END(aes_ctr_enc_192_avx_by8) - -/* - * routine to do AES256 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 0 - -SYM_FUNC_END(aes_ctr_enc_256_avx_by8) - -/* - * routine to do AES128 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 1 - -SYM_FUNC_END(aes_xctr_enc_128_avx_by8) - -/* - * routine to do AES192 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 1 - -SYM_FUNC_END(aes_xctr_enc_192_avx_by8) - -/* - * routine to do AES256 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 1 - -SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index eb153eff9331..b37881bb9f15 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -17,6 +17,7 @@ */ #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/frame.h> #define STATE1 %xmm0 @@ -1071,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc) * size_t len, u8 *iv) */ SYM_FUNC_START(aesni_ctr_enc) + ANNOTATE_NOENDBR FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 11e95fc62636..bc655d794a95 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -23,7 +23,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> -#include <crypto/ctr.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> #include <crypto/xts.h> @@ -82,30 +81,8 @@ asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); #ifdef CONFIG_X86_64 - asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); - -asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); - - -asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -376,24 +353,8 @@ static int cts_cbc_decrypt(struct skcipher_request *req) } #ifdef CONFIG_X86_64 -static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv) -{ - /* - * based on key length, override with the by8 version - * of ctr mode encryption/decryption for improved performance - * aes_set_key_common() ensures that key length is one of - * {128,192,256} - */ - if (ctx->key_length == AES_KEYSIZE_128) - aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); - else - aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); -} - -static int ctr_crypt(struct skcipher_request *req) +/* This is the non-AVX version. */ +static int ctr_crypt_aesni(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); @@ -407,10 +368,9 @@ static int ctr_crypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { kernel_fpu_begin(); if (nbytes & AES_BLOCK_MASK) - static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, - walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, - walk.iv); + aesni_ctr_enc(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= ~AES_BLOCK_MASK; if (walk.nbytes == walk.total && nbytes > 0) { @@ -426,59 +386,6 @@ static int ctr_crypt(struct skcipher_request *req) } return err; } - -static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv, - unsigned int byte_ctr) -{ - if (ctx->key_length == AES_KEYSIZE_128) - aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else - aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); -} - -static int xctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); - u8 keystream[AES_BLOCK_SIZE]; - struct skcipher_walk walk; - unsigned int nbytes; - unsigned int byte_ctr = 0; - int err; - __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) > 0) { - kernel_fpu_begin(); - if (nbytes & AES_BLOCK_MASK) - aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, - walk.src.virt.addr, nbytes & AES_BLOCK_MASK, - walk.iv, byte_ctr); - nbytes &= ~AES_BLOCK_MASK; - byte_ctr += walk.nbytes - nbytes; - - if (walk.nbytes == walk.total && nbytes > 0) { - memcpy(block, walk.iv, AES_BLOCK_SIZE); - block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); - aesni_enc(ctx, keystream, (u8 *)block); - crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - - nbytes, walk.src.virt.addr + walk.nbytes - - nbytes, keystream, nbytes); - byte_ctr += nbytes; - nbytes = 0; - } - kernel_fpu_end(); - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} #endif static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, @@ -581,11 +488,8 @@ xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); - const unsigned int cryptlen = req->cryptlen; - struct scatterlist *src = req->src; - struct scatterlist *dst = req->dst; - if (unlikely(cryptlen < AES_BLOCK_SIZE)) + if (unlikely(req->cryptlen < AES_BLOCK_SIZE)) return -EINVAL; kernel_fpu_begin(); @@ -593,23 +497,16 @@ xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, /* * In practice, virtually all XTS plaintexts and ciphertexts are either - * 512 or 4096 bytes, aligned such that they don't span page boundaries. - * To optimize the performance of these cases, and also any other case - * where no page boundary is spanned, the below fast-path handles - * single-page sources and destinations as efficiently as possible. + * 512 or 4096 bytes and do not use multiple scatterlist elements. To + * optimize the performance of these cases, the below fast-path handles + * single-scatterlist-element messages as efficiently as possible. The + * code is 64-bit specific, as it assumes no page mapping is needed. */ - if (likely(src->length >= cryptlen && dst->length >= cryptlen && - src->offset + cryptlen <= PAGE_SIZE && - dst->offset + cryptlen <= PAGE_SIZE)) { - struct page *src_page = sg_page(src); - struct page *dst_page = sg_page(dst); - void *src_virt = kmap_local_page(src_page) + src->offset; - void *dst_virt = kmap_local_page(dst_page) + dst->offset; - - (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen, - req->iv); - kunmap_local(dst_virt); - kunmap_local(src_virt); + if (IS_ENABLED(CONFIG_X86_64) && + likely(req->src->length >= req->cryptlen && + req->dst->length >= req->cryptlen)) { + (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src), + sg_virt(req->dst), req->cryptlen, req->iv); kernel_fpu_end(); return 0; } @@ -731,8 +628,8 @@ static struct skcipher_alg aesni_skciphers[] = { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + .encrypt = ctr_crypt_aesni, + .decrypt = ctr_crypt_aesni, #endif }, { .base = { @@ -758,35 +655,105 @@ static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; #ifdef CONFIG_X86_64 -/* - * XCTR does not have a non-AVX implementation, so it must be enabled - * conditionally. - */ -static struct skcipher_alg aesni_xctr = { - .base = { - .cra_name = "__xctr(aes)", - .cra_driver_name = "__xctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .chunksize = AES_BLOCK_SIZE, - .setkey = aesni_skcipher_setkey, - .encrypt = xctr_crypt, - .decrypt = xctr_crypt, -}; - -static struct simd_skcipher_alg *aesni_simd_xctr; - asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); -#define DEFINE_XTS_ALG(suffix, driver_name, priority) \ +/* __always_inline to avoid indirect call */ +static __always_inline int +ctr_crypt(struct skcipher_request *req, + void (*ctr64_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u64 le_ctr[2])) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + unsigned int nbytes, p1_nbytes, nblocks; + struct skcipher_walk walk; + u64 le_ctr[2]; + u64 ctr64; + int err; + + ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); + le_ctr[1] = get_unaligned_be64(&req->iv[0]); + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) { + /* Not the end yet, so keep the length block-aligned. */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + nblocks = nbytes / AES_BLOCK_SIZE; + } else { + /* It's the end, so include any final partial block. */ + nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + } + ctr64 += nblocks; + + kernel_fpu_begin(); + if (likely(ctr64 >= nblocks)) { + /* The low 64 bits of the counter won't overflow. */ + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, le_ctr); + } else { + /* + * The low 64 bits of the counter will overflow. The + * assembly doesn't handle this case, so split the + * operation into two at the point where the overflow + * will occur. After the first part, add the carry bit. + */ + p1_nbytes = min_t(unsigned int, nbytes, + (nblocks - ctr64) * AES_BLOCK_SIZE); + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, p1_nbytes, le_ctr); + le_ctr[0] = 0; + le_ctr[1]++; + (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, + walk.dst.virt.addr + p1_nbytes, + nbytes - p1_nbytes, le_ctr); + } + kernel_fpu_end(); + le_ctr[0] = ctr64; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + put_unaligned_be64(ctr64, &req->iv[8]); + put_unaligned_be64(le_ctr[1], &req->iv[0]); + + return err; +} + +/* __always_inline to avoid indirect call */ +static __always_inline int +xctr_crypt(struct skcipher_request *req, + void (*xctr_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u8 iv[AES_BLOCK_SIZE], u64 ctr)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; + u64 ctr = 1; + int err; + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + + kernel_fpu_begin(); + (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, + nbytes, req->iv, ctr); + kernel_fpu_end(); + + ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return err; +} + +#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ \ asmlinkage void \ aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ @@ -805,32 +772,80 @@ static int xts_decrypt_##suffix(struct skcipher_request *req) \ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ } \ \ -static struct skcipher_alg aes_xts_alg_##suffix = { \ - .base = { \ - .cra_name = "__xts(aes)", \ - .cra_driver_name = "__" driver_name, \ - .cra_priority = priority, \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = AES_BLOCK_SIZE, \ - .cra_ctxsize = XTS_AES_CTX_SIZE, \ - .cra_module = THIS_MODULE, \ - }, \ - .min_keysize = 2 * AES_MIN_KEY_SIZE, \ - .max_keysize = 2 * AES_MAX_KEY_SIZE, \ - .ivsize = AES_BLOCK_SIZE, \ - .walksize = 2 * AES_BLOCK_SIZE, \ - .setkey = xts_setkey_aesni, \ - .encrypt = xts_encrypt_##suffix, \ - .decrypt = xts_decrypt_##suffix, \ -}; \ +asmlinkage void \ +aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ + \ +static int ctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ +} \ + \ +asmlinkage void \ +aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, \ + const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ \ -static struct simd_skcipher_alg *aes_xts_simdalg_##suffix +static int xctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return xctr_crypt(req, aes_xctr_crypt_##suffix); \ +} \ + \ +static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ + .base.cra_name = "__xts(aes)", \ + .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = AES_BLOCK_SIZE, \ + .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = 2 * AES_MIN_KEY_SIZE, \ + .max_keysize = 2 * AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .walksize = 2 * AES_BLOCK_SIZE, \ + .setkey = xts_setkey_aesni, \ + .encrypt = xts_encrypt_##suffix, \ + .decrypt = xts_decrypt_##suffix, \ +}, { \ + .base.cra_name = "__ctr(aes)", \ + .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = ctr_crypt_##suffix, \ + .decrypt = ctr_crypt_##suffix, \ +}, { \ + .base.cra_name = "__xctr(aes)", \ + .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = xctr_crypt_##suffix, \ + .decrypt = xctr_crypt_##suffix, \ +}}; \ + \ +static struct simd_skcipher_alg * \ +simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] -DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); +DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) -DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); -DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); -DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); #endif /* The common part of the x86_64 AES-GCM key struct */ @@ -1291,41 +1306,40 @@ static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], scatterwalk_start(&walk, sg_src); while (assoclen) { - unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen); - void *mapped = scatterwalk_map(&walk); - const void *src = mapped; + unsigned int orig_len_this_step = scatterwalk_next( + &walk, assoclen); + unsigned int len_this_step = orig_len_this_step; unsigned int len; + const u8 *src = walk.addr; - assoclen -= len_this_page; - scatterwalk_advance(&walk, len_this_page); if (unlikely(pos)) { - len = min(len_this_page, 16 - pos); + len = min(len_this_step, 16 - pos); memcpy(&buf[pos], src, len); pos += len; src += len; - len_this_page -= len; + len_this_step -= len; if (pos < 16) goto next; aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); pos = 0; } - len = len_this_page; + len = len_this_step; if (unlikely(assoclen)) /* Not the last segment yet? */ len = round_down(len, 16); aes_gcm_aad_update(key, ghash_acc, src, len, flags); src += len; - len_this_page -= len; - if (unlikely(len_this_page)) { - memcpy(buf, src, len_this_page); - pos = len_this_page; + len_this_step -= len; + if (unlikely(len_this_step)) { + memcpy(buf, src, len_this_step); + pos = len_this_step; } next: - scatterwalk_unmap(mapped); - scatterwalk_pagedone(&walk, 0, assoclen); + scatterwalk_done_src(&walk, orig_len_this_step); if (need_resched()) { kernel_fpu_end(); kernel_fpu_begin(); } + assoclen -= orig_len_this_step; } if (unlikely(pos)) aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); @@ -1536,34 +1550,15 @@ DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, AES_GCM_KEY_AVX10_SIZE, 800); #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ -/* - * This is a list of CPU models that are known to suffer from downclocking when - * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode - * implementations with zmm registers won't be used by default. Implementations - * with ymm registers (256-bit vectors) will be used by default instead. - */ -static const struct x86_cpu_id zmm_exclusion_list[] = { - X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), - X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_ICELAKE_D, 0), - X86_MATCH_VFM(INTEL_ICELAKE, 0), - X86_MATCH_VFM(INTEL_ICELAKE_L, 0), - X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), - X86_MATCH_VFM(INTEL_TIGERLAKE, 0), - /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ - /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */ - {}, -}; - static int __init register_avx_algs(void) { int err; if (!boot_cpu_has(X86_FEATURE_AVX)) return 0; - err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, - &aes_xts_simdalg_aesni_avx); + err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx), + simd_skcipher_algs_aesni_avx); if (err) return err; err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, @@ -1571,6 +1566,12 @@ static int __init register_avx_algs(void) aes_gcm_simdalgs_aesni_avx); if (err) return err; + /* + * Note: not all the algorithms registered below actually require + * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. + * Similarly, the assembler support was added at about the same time. + * For simplicity, just always check for VAES and VPCLMULQDQ together. + */ #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_VAES) || @@ -1578,8 +1579,9 @@ static int __init register_avx_algs(void) !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return 0; - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, - &aes_xts_simdalg_vaes_avx2); + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2), + simd_skcipher_algs_vaes_avx2); if (err) return err; @@ -1590,8 +1592,9 @@ static int __init register_avx_algs(void) XFEATURE_MASK_AVX512, NULL)) return 0; - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, - &aes_xts_simdalg_vaes_avx10_256); + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), + simd_skcipher_algs_vaes_avx10_256); if (err) return err; err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, @@ -1600,16 +1603,18 @@ static int __init register_avx_algs(void) if (err) return err; - if (x86_match_cpu(zmm_exclusion_list)) { + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; - aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) + skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; } - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, - &aes_xts_simdalg_vaes_avx10_512); + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), + simd_skcipher_algs_vaes_avx10_512); if (err) return err; err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, @@ -1623,27 +1628,31 @@ static int __init register_avx_algs(void) static void unregister_avx_algs(void) { - if (aes_xts_simdalg_aesni_avx) - simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, - &aes_xts_simdalg_aesni_avx); + if (simd_skcipher_algs_aesni_avx[0]) + simd_unregister_skciphers(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx), + simd_skcipher_algs_aesni_avx); if (aes_gcm_simdalgs_aesni_avx[0]) simd_unregister_aeads(aes_gcm_algs_aesni_avx, ARRAY_SIZE(aes_gcm_algs_aesni_avx), aes_gcm_simdalgs_aesni_avx); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) - if (aes_xts_simdalg_vaes_avx2) - simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, - &aes_xts_simdalg_vaes_avx2); - if (aes_xts_simdalg_vaes_avx10_256) - simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, - &aes_xts_simdalg_vaes_avx10_256); + if (simd_skcipher_algs_vaes_avx2[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2), + simd_skcipher_algs_vaes_avx2); + if (simd_skcipher_algs_vaes_avx10_256[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), + simd_skcipher_algs_vaes_avx10_256); if (aes_gcm_simdalgs_vaes_avx10_256[0]) simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), aes_gcm_simdalgs_vaes_avx10_256); - if (aes_xts_simdalg_vaes_avx10_512) - simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, - &aes_xts_simdalg_vaes_avx10_512); + if (simd_skcipher_algs_vaes_avx10_512[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), + simd_skcipher_algs_vaes_avx10_512); if (aes_gcm_simdalgs_vaes_avx10_512[0]) simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), @@ -1676,13 +1685,6 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) { - /* optimize performance of ctr mode encryption transform */ - static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); - pr_info("AES CTR mode by8 optimization enabled\n"); - } -#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1700,14 +1702,6 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - err = simd_register_skciphers_compat(&aesni_xctr, 1, - &aesni_simd_xctr); - if (err) - goto unregister_aeads; -#endif /* CONFIG_X86_64 */ - err = register_avx_algs(); if (err) goto unregister_avx; @@ -1716,11 +1710,6 @@ static int __init aesni_init(void) unregister_avx: unregister_avx_algs(); -#ifdef CONFIG_X86_64 - if (aesni_simd_xctr) - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); -unregister_aeads: -#endif /* CONFIG_X86_64 */ simd_unregister_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni), aes_gcm_simdalgs_aesni); @@ -1740,10 +1729,6 @@ static void __exit aesni_exit(void) simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); -#endif /* CONFIG_X86_64 */ unregister_avx_algs(); } diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 646477a13e11..1dfef28c1266 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) jmp .Ldec_max24; SYM_FUNC_END(__camellia_dec_blk16) -SYM_FUNC_START(camellia_ecb_enc_16way) +SYM_TYPED_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) RET; SYM_FUNC_END(camellia_ecb_enc_16way) -SYM_FUNC_START(camellia_ecb_dec_16way) +SYM_TYPED_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) RET; SYM_FUNC_END(camellia_ecb_dec_16way) -SYM_FUNC_START(camellia_cbc_dec_16way) +SYM_TYPED_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index a0eb94e53b1b..b1c9b9450555 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 816b6bb8bded..824cb94de6c2 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "camellia-x86_64-asm_64.S" .text @@ -177,7 +178,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -SYM_FUNC_START(__camellia_enc_blk) +SYM_TYPED_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk) RET; SYM_FUNC_END(__camellia_enc_blk) -SYM_FUNC_START(camellia_dec_blk) +SYM_TYPED_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -SYM_FUNC_START(__camellia_enc_blk_2way) +SYM_TYPED_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way) RET; SYM_FUNC_END(__camellia_enc_blk_2way) -SYM_FUNC_START(camellia_dec_blk_2way) +SYM_TYPED_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 7b3a1cf0984b..8bb74a272879 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -133,12 +133,6 @@ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) } EXPORT_SYMBOL(hchacha_block_arch); -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { @@ -169,7 +163,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, err = skcipher_walk_virt(&walk, req, false); - chacha_init_generic(state, ctx->key, iv); + chacha_init(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -211,7 +205,7 @@ static int xchacha_simd(struct skcipher_request *req) struct chacha_ctx subctx; u8 real_iv[16]; - chacha_init_generic(state, ctx->key, req->iv); + chacha_init(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { kernel_fpu_begin(); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index e88439d3828e..34600f90d8a6 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -73,7 +73,7 @@ static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; + const u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 41bc02e48916..c759ec808bf1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -189,6 +189,20 @@ static int ghash_async_init(struct ahash_request *req) return crypto_shash_init(desc); } +static void ghash_init_cryptd_req(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ahash_request_set_callback(cryptd_req, req->base.flags, + req->base.complete, req->base.data); + ahash_request_set_crypt(cryptd_req, req->src, req->result, + req->nbytes); +} + static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); @@ -198,8 +212,7 @@ static int ghash_async_update(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_update(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); @@ -216,8 +229,7 @@ static int ghash_async_final(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_final(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); @@ -257,8 +269,7 @@ static int ghash_async_digest(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_digest(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 97e283621851..84e47f7f6188 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #include "glue_helper-asm-avx.S" @@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) RET; SYM_FUNC_END(__serpent_dec_blk8_avx) -SYM_FUNC_START(serpent_ecb_enc_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) -SYM_FUNC_START(serpent_ecb_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) -SYM_FUNC_START(serpent_cbc_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index d2288bf38a8a..071e90e7f0d8 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "twofish-x86_64-asm-3way.S" .text @@ -220,7 +221,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -SYM_FUNC_START(__twofish_enc_blk_3way) +SYM_TYPED_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) RET; SYM_FUNC_END(__twofish_enc_blk_3way) -SYM_FUNC_START(twofish_dec_blk_3way) +SYM_TYPED_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 775af290cd19..e08b4ba07b93 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -8,6 +8,7 @@ .text #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -202,7 +203,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -SYM_FUNC_START(twofish_enc_blk) +SYM_TYPED_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk) RET SYM_FUNC_END(twofish_enc_blk) -SYM_FUNC_START(twofish_dec_blk) +SYM_TYPED_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ |