summaryrefslogtreecommitdiffstats
path: root/arch/arm64/kernel/vdso/vgetrandom-chacha.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kernel/vdso/vgetrandom-chacha.S')
-rw-r--r--arch/arm64/kernel/vdso/vgetrandom-chacha.S172
1 files changed, 172 insertions, 0 deletions
diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..67890b445309
--- /dev/null
+++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+#include <asm/assembler.h>
+
+ .text
+
+#define state0 v0
+#define state1 v1
+#define state2 v2
+#define state3 v3
+#define copy0 v4
+#define copy0_q q4
+#define copy1 v5
+#define copy2 v6
+#define copy3 v7
+#define copy3_d d7
+#define one_d d16
+#define one_q q16
+#define one_v v16
+#define tmp v17
+#define rot8 v18
+
+/*
+ * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
+ * number of blocks of output with nonce 0, taking an input key and 8-bytes
+ * counter. Importantly does not spill to the stack.
+ *
+ * This implementation avoids d8-d15 because they are callee-save in user
+ * space.
+ *
+ * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
+ * const uint8_t *key,
+ * uint32_t *counter,
+ * size_t nblocks)
+ *
+ * x0: output bytes
+ * x1: 32-byte key input
+ * x2: 8-byte counter input/output
+ * x3: number of 64-byte block to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+ /* copy0 = "expand 32-byte k" */
+ mov_q x8, 0x3320646e61707865
+ mov_q x9, 0x6b20657479622d32
+ mov copy0.d[0], x8
+ mov copy0.d[1], x9
+
+ /* copy1,copy2 = key */
+ ld1 { copy1.4s, copy2.4s }, [x1]
+ /* copy3 = counter || zero nonce */
+ ld1 { copy3.2s }, [x2]
+
+ movi one_v.2s, #1
+ uzp1 one_v.4s, one_v.4s, one_v.4s
+
+.Lblock:
+ /* copy state to auxiliary vectors for the final add after the permute. */
+ mov state0.16b, copy0.16b
+ mov state1.16b, copy1.16b
+ mov state2.16b, copy2.16b
+ mov state3.16b, copy3.16b
+
+ mov w4, 20
+.Lpermute:
+ /*
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers state0-state3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ */
+
+.Ldoubleround:
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ add state0.4s, state0.4s, state1.4s
+ eor state3.16b, state3.16b, state0.16b
+ rev32 state3.8h, state3.8h
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #12
+ sri state1.4s, tmp.4s, #20
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ add state0.4s, state0.4s, state1.4s
+ eor tmp.16b, state3.16b, state0.16b
+ shl state3.4s, tmp.4s, #8
+ sri state3.4s, tmp.4s, #24
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #7
+ sri state1.4s, tmp.4s, #25
+
+ /* state1[0,1,2,3] = state1[1,2,3,0] */
+ ext state1.16b, state1.16b, state1.16b, #4
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ ext state2.16b, state2.16b, state2.16b, #8
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ ext state3.16b, state3.16b, state3.16b, #12
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ add state0.4s, state0.4s, state1.4s
+ eor state3.16b, state3.16b, state0.16b
+ rev32 state3.8h, state3.8h
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #12
+ sri state1.4s, tmp.4s, #20
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ add state0.4s, state0.4s, state1.4s
+ eor tmp.16b, state3.16b, state0.16b
+ shl state3.4s, tmp.4s, #8
+ sri state3.4s, tmp.4s, #24
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #7
+ sri state1.4s, tmp.4s, #25
+
+ /* state1[0,1,2,3] = state1[3,0,1,2] */
+ ext state1.16b, state1.16b, state1.16b, #12
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ ext state2.16b, state2.16b, state2.16b, #8
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ ext state3.16b, state3.16b, state3.16b, #4
+
+ subs w4, w4, #2
+ b.ne .Ldoubleround
+
+ /* output0 = state0 + state0 */
+ add state0.4s, state0.4s, copy0.4s
+ /* output1 = state1 + state1 */
+ add state1.4s, state1.4s, copy1.4s
+ /* output2 = state2 + state2 */
+ add state2.4s, state2.4s, copy2.4s
+ /* output2 = state3 + state3 */
+ add state3.4s, state3.4s, copy3.4s
+ st1 { state0.16b - state3.16b }, [x0]
+
+ /*
+ * ++copy3.counter, the 'add' clears the upper half of the SIMD register
+ * which is the expected behaviour here.
+ */
+ add copy3_d, copy3_d, one_d
+
+ /* output += 64, --nblocks */
+ add x0, x0, 64
+ subs x3, x3, #1
+ b.ne .Lblock
+
+ /* counter = copy3.counter */
+ st1 { copy3.2s }, [x2]
+
+ /* Zero out the potentially sensitive regs, in case nothing uses these again. */
+ movi state0.16b, #0
+ movi state1.16b, #0
+ movi state2.16b, #0
+ movi state3.16b, #0
+ movi copy1.16b, #0
+ movi copy2.16b, #0
+ ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
+
+emit_aarch64_feature_1_and