author     Simon Guo <wei.guo.simon@gmail.com>        2018-06-07 09:57:53 +0800
committer  Michael Ellerman <mpe@ellerman.id.au>      2018-07-24 22:03:21 +1000
commit     d58badfb7cf1792ab4f1d0cd7896d733b85d650f (patch)
tree       2dbfd8f16f2fd270790f5e91fa527af3b63f1731 /arch/powerpc/lib/copypage_power7.S
parent     f1ecbaf466be5a7a5c666b41eede6991caff8646 (diff)
powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison
This patch adds VMX primitives to do memcmp() when the compare size is
equal to or greater than 4K bytes. The KSM feature can benefit from this.

Test result with the following test program:

------
# cat tools/testing/selftests/powerpc/stringloops/memcmp.c
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "utils.h"

#define SIZE (1024 * 1024 * 900)
#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++) {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }

        for (i = 0; i < ITERATIONS; i++) {
                int ret = test_memcmp(s1, s2, SIZE);

                if (ret) {
                        printf("return %d at [%ld]! should have returned zero\n",
                               ret, i);
                        abort();
                }
        }

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------

Without this patch (but with the first patch in the series,
"powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()."):

        4.726728762 seconds time elapsed ( +- 3.54% )

With the VMX patch:

        4.234335473 seconds time elapsed ( +- 2.63% )

There is a ~10% improvement. Testing an unaligned, differently offset
version (shifting s1 and s2 by random offsets within 16 bytes) achieves
an even higher improvement than 10%.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
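The unaligned measurement above shifts s1 and s2 by random offsets
within 16 bytes. A minimal sketch of that variation, assuming the
allocation gains 16 bytes of slack; the helper name alloc_offset_buf()
and its exact structure are hypothetical, not taken from the actual
selftest:

------
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#define SIZE (1024 * 1024 * 900)

/* Hypothetical helper: allocate with 16 bytes of slack, then return a
 * pointer shifted by a random offset within 16 bytes, so the two
 * buffers end up unaligned and differently aligned relative to each
 * other. */
static char *alloc_offset_buf(void)
{
        char *p = memalign(128, SIZE + 16);

        if (!p) {
                perror("memalign");
                exit(1);
        }
        return p + (rand() % 16);
}
------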
Diffstat (limited to 'arch/powerpc/lib/copypage_power7.S')
-rw-r--r--  arch/powerpc/lib/copypage_power7.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addi r3,r3,128
bdnz 1b
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#else
li r0,(PAGE_SIZE/128)
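For reference, the enter_vmx_copy()/exit_vmx_copy() helpers renamed in
this diff live in arch/powerpc/lib/vmx-helper.c; renaming them to
enter_vmx_ops()/exit_vmx_ops() lets the same pair serve memcmp() as
well as the copy routines. A minimal userspace sketch of the calling
pattern the assembly implements, with stubbed helpers and memcmp()
standing in for the real 16-byte VMX compare loop (the threshold name
and wrapper function are illustrative, not kernel code):

------
#include <stddef.h>
#include <string.h>

#define VMX_OPS_THRESHOLD 4096  /* commit enables VMX for n >= 4K */

/* Stub: the kernel version returns 0 when VMX cannot be used
 * (e.g. in interrupt context), forcing the scalar fallback. */
static int enter_vmx_ops(void) { return 1; }
/* Stub: the kernel version restores preemption state and passes
 * dest through for the tail-call case. */
static void *exit_vmx_ops(void *dest) { return dest; }

int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
        int ret;

        /* Short compares, or VMX unavailable: take the scalar path. */
        if (n < VMX_OPS_THRESHOLD || !enter_vmx_ops())
                return memcmp(s1, s2, n);

        /* The real code compares 16-byte VMX vectors in a loop here. */
        ret = memcmp(s1, s2, n);

        exit_vmx_ops(NULL);
        return ret;
}
------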