summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/serpent-sse2-i586-asm_32.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@mbnet.fi>2011-12-20 12:58:06 +0200
committerHerbert Xu <herbert@gondor.apana.org.au>2012-01-13 16:38:40 +1100
commit847cb7ef565d31484f426677e0bea081bfd2acd9 (patch)
tree7325f4ce5961e0d51ea4707119aeba80622991c3 /arch/x86/crypto/serpent-sse2-i586-asm_32.S
parent4c58464b8034cef4317593bf4ccbfc19d5bb3a77 (diff)
downloadlinux-847cb7ef565d31484f426677e0bea081bfd2acd9.tar.gz
linux-847cb7ef565d31484f426677e0bea081bfd2acd9.tar.bz2
linux-847cb7ef565d31484f426677e0bea081bfd2acd9.zip
crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions
Matrix transpose macro in serpent-sse2 uses mix of SSE2 integer and SSE floating point instructions, which might cause performance penality on some CPUs. This patch replaces transpose_4x4 macro with version that uses only SSE2 integer instructions. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/serpent-sse2-i586-asm_32.S')
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S29
1 files changed, 13 insertions, 16 deletions
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 4e37677ca851..c00053d42f99 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -463,23 +463,20 @@
pand x0, x4; \
pxor x2, x4;
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
- movdqa x2, t3; \
- movdqa x0, t1; \
- unpcklps x3, t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
movdqa x0, t2; \
- unpcklps x1, t1; \
- unpckhps x1, t2; \
- movdqa t3, x1; \
- unpckhps x3, x2; \
- movdqa t1, x0; \
- movhlps t1, x1; \
- movdqa t2, t1; \
- movlhps t3, x0; \
- movlhps x2, t1; \
- movhlps t2, x2; \
- movdqa x2, x3; \
- movdqa t1, x2;
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
movdqu (0*4*4)(in), x0; \