diff options
author | Ard Biesheuvel <ardb@kernel.org> | 2022-01-27 12:35:44 +0100 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2022-02-05 15:10:51 +1100 |
commit | fc074e130051015e39245a4241956ff122e2f465 (patch) | |
tree | bd626b745bb39ebd97f5b932ff4f4a2948111b1c /arch/arm64/crypto/aes-neonbs-core.S | |
parent | c8bf850e991a1838964ed8d8426f96cf8d3fbab5 (diff) | |
download | linux-stable-fc074e130051015e39245a4241956ff122e2f465.tar.gz linux-stable-fc074e130051015e39245a4241956ff122e2f465.tar.bz2 linux-stable-fc074e130051015e39245a4241956ff122e2f465.zip |
crypto: arm64/aes-neonbs-ctr - fallback to plain NEON for final chunk
Instead of processing the entire input with the 8-way bit sliced
algorithm, which is sub-optimal for inputs that are not a multiple of
128 bytes in size, invoke the plain NEON version of CTR for the
remainder of the input after processing the bulk using 128 byte strides.
This allows us to greatly simplify the asm code that implements CTR, and
get rid of all the branches and special code paths. It also gains us a
couple of percent of performance.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/aes-neonbs-core.S')
-rw-r--r-- | arch/arm64/crypto/aes-neonbs-core.S | 132 |
1 files changed, 25 insertions, 107 deletions
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index a3405b8c344b..f2761481181d 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -869,133 +869,51 @@ SYM_FUNC_END(aesbs_xts_decrypt) /* * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], - * int rounds, int blocks, u8 iv[], u8 final[]) + * int rounds, int blocks, u8 iv[]) */ SYM_FUNC_START(aesbs_ctr_encrypt) - frame_push 8 + stp x29, x30, [sp, #-16]! + mov x29, sp - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - - cmp x25, #0 - cset x26, ne - add x23, x23, x26 // do one extra block if final - - ldp x7, x8, [x24] - ld1 {v0.16b}, [x24] + ldp x7, x8, [x5] + ld1 {v0.16b}, [x5] CPU_LE( rev x7, x7 ) CPU_LE( rev x8, x8 ) adds x8, x8, #1 adc x7, x7, xzr -99: mov x9, #1 - lsl x9, x9, x23 - subs w23, w23, #8 - csel x23, x23, xzr, pl - csel x9, x9, xzr, le - - tbnz x9, #1, 0f - next_ctr v1 - tbnz x9, #2, 0f +0: next_ctr v1 next_ctr v2 - tbnz x9, #3, 0f next_ctr v3 - tbnz x9, #4, 0f next_ctr v4 - tbnz x9, #5, 0f next_ctr v5 - tbnz x9, #6, 0f next_ctr v6 - tbnz x9, #7, 0f next_ctr v7 -0: mov bskey, x21 - mov rounds, x22 + mov bskey, x2 + mov rounds, x3 bl aesbs_encrypt8 - lsr x9, x9, x26 // disregard the extra block - tbnz x9, #0, 0f - - ld1 {v8.16b}, [x20], #16 - eor v0.16b, v0.16b, v8.16b - st1 {v0.16b}, [x19], #16 - tbnz x9, #1, 1f - - ld1 {v9.16b}, [x20], #16 - eor v1.16b, v1.16b, v9.16b - st1 {v1.16b}, [x19], #16 - tbnz x9, #2, 2f - - ld1 {v10.16b}, [x20], #16 - eor v4.16b, v4.16b, v10.16b - st1 {v4.16b}, [x19], #16 - tbnz x9, #3, 3f + ld1 { v8.16b-v11.16b}, [x1], #64 + ld1 {v12.16b-v15.16b}, [x1], #64 - ld1 {v11.16b}, [x20], #16 - eor v6.16b, v6.16b, v11.16b - st1 {v6.16b}, [x19], #16 - tbnz x9, #4, 4f + eor v8.16b, v0.16b, v8.16b + eor v9.16b, v1.16b, v9.16b + eor v10.16b, v4.16b, v10.16b + eor v11.16b, v6.16b, v11.16b + eor v12.16b, v3.16b, v12.16b + eor v13.16b, v7.16b, v13.16b + eor v14.16b, v2.16b, v14.16b + eor v15.16b, v5.16b, v15.16b - ld1 {v12.16b}, [x20], #16 - eor v3.16b, v3.16b, v12.16b - st1 {v3.16b}, [x19], #16 - tbnz x9, #5, 5f + st1 { v8.16b-v11.16b}, [x0], #64 + st1 {v12.16b-v15.16b}, [x0], #64 - ld1 {v13.16b}, [x20], #16 - eor v7.16b, v7.16b, v13.16b - st1 {v7.16b}, [x19], #16 - tbnz x9, #6, 6f - - ld1 {v14.16b}, [x20], #16 - eor v2.16b, v2.16b, v14.16b - st1 {v2.16b}, [x19], #16 - tbnz x9, #7, 7f + next_ctr v0 + subs x4, x4, #8 + b.gt 0b - ld1 {v15.16b}, [x20], #16 - eor v5.16b, v5.16b, v15.16b - st1 {v5.16b}, [x19], #16 - -8: next_ctr v0 - st1 {v0.16b}, [x24] - cbz x23, .Lctr_done - - b 99b - -.Lctr_done: - frame_pop + st1 {v0.16b}, [x5] + ldp x29, x30, [sp], #16 ret - - /* - * If we are handling the tail of the input (x6 != NULL), return the - * final keystream block back to the caller. - */ -0: cbz x25, 8b - st1 {v0.16b}, [x25] - b 8b -1: cbz x25, 8b - st1 {v1.16b}, [x25] - b 8b -2: cbz x25, 8b - st1 {v4.16b}, [x25] - b 8b -3: cbz x25, 8b - st1 {v6.16b}, [x25] - b 8b -4: cbz x25, 8b - st1 {v3.16b}, [x25] - b 8b -5: cbz x25, 8b - st1 {v7.16b}, [x25] - b 8b -6: cbz x25, 8b - st1 {v2.16b}, [x25] - b 8b -7: cbz x25, 8b - st1 {v5.16b}, [x25] - b 8b SYM_FUNC_END(aesbs_ctr_encrypt) |