diffstat:
 -rw-r--r--  arch/powerpc/crypto/crc32-vpmsum_core.S | 31 +++++++++++++++++++++++++++++++-
 -rw-r--r--  arch/powerpc/crypto/crc32c-vpmsum_asm.S |  1 +
 2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
index 7c6be6a5c977..aadb59c96a27 100644
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -35,7 +35,9 @@
 
 	.text
 
-#if defined(__BIG_ENDIAN__)
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
 #define BYTESWAP_DATA
 #else
 #undef BYTESWAP_DATA
@@ -108,7 +110,11 @@ FUNC_START(CRC_FUNCTION_NAME)
 	/* Get the initial value into v8 */
 	vxor	v8,v8,v8
 	MTVRD(v8, R3)
+#ifdef REFLECT
 	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
 
 #ifdef BYTESWAP_DATA
 	addis	r3,r2,.byteswap_constant@toc@ha
@@ -354,6 +360,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vxor	v6,v6,v14
 	vxor	v7,v7,v15
 
+#ifdef REFLECT
 	/*
 	 * vpmsumd produces a 96 bit result in the least significant bits
 	 * of the register. Since we are bit reflected we have to shift it
@@ -368,6 +375,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vsldoi	v5,v5,zeroes,4
 	vsldoi	v6,v6,zeroes,4
 	vsldoi	v7,v7,zeroes,4
+#endif
 
 	/* xor with last 1024 bits */
 	lvx	v8,0,r4
@@ -511,13 +519,33 @@ FUNC_START(CRC_FUNCTION_NAME)
 	vsldoi	v1,v0,v0,8
 	vxor	v0,v0,v1		/* xor two 64 bit results together */
 
+#ifdef REFLECT
 	/* shift left one bit */
 	vspltisb v1,1
 	vsl	v0,v0,v1
+#endif
 
 	vand	v0,v0,mask_64bit
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
 
 	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
 	 * The reflected version of Barrett reduction. Instead of bit
 	 * reflecting our data (which is expensive to do), we bit reflect our
 	 * constants and our algorithm, which means the intermediate data in
@@ -537,6 +565,7 @@ FUNC_START(CRC_FUNCTION_NAME)
 	 * V0 [ 0 X 2 3 ]
 	 */
 	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits of */
+#endif
 
 	/* Get it into r3 */
 	MFVRD(R3, v0)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
index c0d080caefc1..d2bea48051a0 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -842,4 +842,5 @@
 	.octa	0x00000000000000000000000105ec76f1
 
 #define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
 #include "crc32-vpmsum_core.S"