diff options
Diffstat (limited to 'arch/powerpc/lib/memcmp_64.S')
-rw-r--r-- | arch/powerpc/lib/memcmp_64.S | 414 |
1 files changed, 406 insertions, 8 deletions
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S index d75d18b7bd55..844d8e774492 100644 --- a/arch/powerpc/lib/memcmp_64.S +++ b/arch/powerpc/lib/memcmp_64.S @@ -9,6 +9,7 @@ */ #include <asm/ppc_asm.h> #include <asm/export.h> +#include <asm/ppc-opcode.h> #define off8 r6 #define off16 r7 @@ -24,28 +25,102 @@ #define rH r31 #ifdef __LITTLE_ENDIAN__ +#define LH lhbrx +#define LW lwbrx #define LD ldbrx +#define LVS lvsr +#define VPERM(_VRT,_VRA,_VRB,_VRC) \ + vperm _VRT,_VRB,_VRA,_VRC #else +#define LH lhzx +#define LW lwzx #define LD ldx +#define LVS lvsl +#define VPERM(_VRT,_VRA,_VRB,_VRC) \ + vperm _VRT,_VRA,_VRB,_VRC #endif -_GLOBAL(memcmp) +#define VMX_THRESH 4096 +#define ENTER_VMX_OPS \ + mflr r0; \ + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl enter_vmx_ops; \ + cmpwi cr1,r3,0; \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +#define EXIT_VMX_OPS \ + mflr r0; \ + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl exit_vmx_ops; \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +/* + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with + * 16 bytes boundary and permute the result with the 1st 16 bytes. + + * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z | + * ^ ^ ^ + * 0xbbbb10 0xbbbb20 0xbbb30 + * ^ + * _vaddr + * + * + * _vmask is the mask generated by LVS + * _v1st_qw is the 1st aligned QW of current addr which is already loaded. + * for example: 0xyyyyyyyyyyyyy012 for big endian + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. + * for example: 0x3456789abcdefzzz for big endian + * The permute result is saved in _v_res. + * for example: 0x0123456789abcdef for big endian. + */ +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ + lvx _v2nd_qw,_vaddr,off16; \ + VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) + +/* + * There are 2 categories for memcmp: + * 1) src/dst has the same offset to the 8 bytes boundary. The handlers + * are named like .Lsameoffset_xxxx + * 2) src/dst has different offset to the 8 bytes boundary. The handlers + * are named like .Ldiffoffset_xxxx + */ +_GLOBAL_TOC(memcmp) cmpdi cr1,r5,0 - /* Use the short loop if both strings are not 8B aligned */ - or r6,r3,r4 + /* Use the short loop if the src/dst addresses are not + * with the same offset of 8 bytes align boundary. + */ + xor r6,r3,r4 andi. r6,r6,7 - /* Use the short loop if length is less than 32B */ - cmpdi cr6,r5,31 + /* Fall back to short loop if compare at aligned addrs + * with less than 8 bytes. + */ + cmpdi cr6,r5,7 beq cr1,.Lzero - bne .Lshort - bgt cr6,.Llong + bgt cr6,.Lno_short .Lshort: mtctr r5 - 1: lbz rA,0(r3) lbz rB,0(r4) subf. rC,rB,rA @@ -78,11 +153,98 @@ _GLOBAL(memcmp) li r3,0 blr +.Lno_short: + dcbt 0,r3 + dcbt 0,r4 + bne .Ldiffoffset_8bytes_make_align_start + + +.Lsameoffset_8bytes_make_align_start: + /* attempt to compare bytes not aligned with 8 bytes so that + * rest comparison can run based on 8 bytes alignment. + */ + andi. r6,r3,7 + + /* Try to compare the first double word which is not 8 bytes aligned: + * load the first double word at (src & ~7UL) and shift left appropriate + * bits before comparision. + */ + rlwinm r6,r3,3,26,28 + beq .Lsameoffset_8bytes_aligned + clrrdi r3,r3,3 + clrrdi r4,r4,3 + LD rA,0,r3 + LD rB,0,r4 + sld rA,rA,r6 + sld rB,rB,r6 + cmpld cr0,rA,rB + srwi r6,r6,3 + bne cr0,.LcmpAB_lightweight + subfic r6,r6,8 + subf. r5,r6,r5 + addi r3,r3,8 + addi r4,r4,8 + beq .Lzero + +.Lsameoffset_8bytes_aligned: + /* now we are aligned with 8 bytes. + * Use .Llong loop if left cmp bytes are equal or greater than 32B. + */ + cmpdi cr6,r5,31 + bgt cr6,.Llong + +.Lcmp_lt32bytes: + /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ + cmpdi cr5,r5,7 + srdi r0,r5,3 + ble cr5,.Lcmp_rest_lt8bytes + + /* handle 8 ~ 31 bytes */ + clrldi r5,r5,61 + mtctr r0 +2: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + bdnz 2b + + cmpwi r5,0 + beq .Lzero + +.Lcmp_rest_lt8bytes: + /* Here we have only less than 8 bytes to compare with. at least s1 + * Address is aligned with 8 bytes. + * The next double words are load and shift right with appropriate + * bits. + */ + subfic r6,r5,8 + slwi r6,r6,3 + LD rA,0,r3 + LD rB,0,r4 + srd rA,rA,r6 + srd rB,rB,r6 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero + .Lnon_zero: mr r3,rC blr .Llong: +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* Try to use vmx loop if length is equal or greater than 4K */ + cmpldi cr6,r5,VMX_THRESH + bge cr6,.Lsameoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Llong_novmx_cmp: +#endif + /* At least s1 addr is aligned with 8 bytes */ li off8,8 li off16,16 li off24,24 @@ -232,4 +394,240 @@ _GLOBAL(memcmp) ld r28,-32(r1) ld r27,-40(r1) blr + +.LcmpAB_lightweight: /* skip NV GPRS restore */ + li r3,1 + bgtlr + li r3,-1 + blr + +#ifdef CONFIG_ALTIVEC +.Lsameoffset_vmx_cmp: + /* Enter with src/dst addrs has the same offset with 8 bytes + * align boundary. + * + * There is an optimization based on following fact: memcmp() + * prones to fail early at the first 32 bytes. + * Before applying VMX instructions which will lead to 32x128bits + * VMX regs load/restore penalty, we compare the first 32 bytes + * so that we can catch the ~80% fail cases. + */ + + li r0,4 + mtctr r0 +.Lsameoffset_prechk_32B_loop: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + addi r5,r5,-8 + bdnz .Lsameoffset_prechk_32B_loop + + ENTER_VMX_OPS + beq cr1,.Llong_novmx_cmp + +3: + /* need to check whether r4 has the same offset with r3 + * for 16 bytes boundary. + */ + xor r0,r3,r4 + andi. r0,r0,0xf + bne .Ldiffoffset_vmx_cmp_start + + /* len is no less than 4KB. Need to align with 16 bytes further. + */ + andi. rA,r3,8 + LD rA,0,r3 + beq 4f + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + addi r5,r5,-8 + + beq cr0,4f + /* save and restore cr0 */ + mfocrf r5,128 + EXIT_VMX_OPS + mtocrf 128,r5 + b .LcmpAB_lightweight + +4: + /* compare 32 bytes for each loop */ + srdi r0,r5,5 + mtctr r0 + clrldi r5,r5,59 + li off16,16 + +.balign 16 +5: + lvx v0,0,r3 + lvx v1,0,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,7f + lvx v0,off16,r3 + lvx v1,off16,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,6f + addi r3,r3,32 + addi r4,r4,32 + bdnz 5b + + EXIT_VMX_OPS + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +6: + addi r3,r3,16 + addi r4,r4,16 + +7: + /* diff the last 16 bytes */ + EXIT_VMX_OPS + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + li off8,8 + bne cr0,.LcmpAB_lightweight + + LD rA,off8,r3 + LD rB,off8,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero +#endif + +.Ldiffoffset_8bytes_make_align_start: + /* now try to align s1 with 8 bytes */ + rlwinm r6,r3,3,26,28 + beq .Ldiffoffset_align_s1_8bytes + + clrrdi r3,r3,3 + LD rA,0,r3 + LD rB,0,r4 /* unaligned load */ + sld rA,rA,r6 + srd rA,rA,r6 + srd rB,rB,r6 + cmpld cr0,rA,rB + srwi r6,r6,3 + bne cr0,.LcmpAB_lightweight + + subfic r6,r6,8 + subf. r5,r6,r5 + addi r3,r3,8 + add r4,r4,r6 + + beq .Lzero + +.Ldiffoffset_align_s1_8bytes: + /* now s1 is aligned with 8 bytes. */ +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* only do vmx ops when the size equal or greater than 4K bytes */ + cmpdi cr5,r5,VMX_THRESH + bge cr5,.Ldiffoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Ldiffoffset_novmx_cmp: +#endif + + + cmpdi cr5,r5,31 + ble cr5,.Lcmp_lt32bytes + +#ifdef CONFIG_ALTIVEC + b .Llong_novmx_cmp +#else + b .Llong +#endif + +#ifdef CONFIG_ALTIVEC +.Ldiffoffset_vmx_cmp: + /* perform a 32 bytes pre-checking before + * enable VMX operations. + */ + li r0,4 + mtctr r0 +.Ldiffoffset_prechk_32B_loop: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + addi r5,r5,-8 + bdnz .Ldiffoffset_prechk_32B_loop + + ENTER_VMX_OPS + beq cr1,.Ldiffoffset_novmx_cmp + +.Ldiffoffset_vmx_cmp_start: + /* Firstly try to align r3 with 16 bytes */ + andi. r6,r3,0xf + li off16,16 + beq .Ldiffoffset_vmx_s1_16bytes_align + + LVS v3,0,r3 + LVS v4,0,r4 + + lvx v5,0,r3 + lvx v6,0,r4 + LD_VSR_CROSS16B(r3,v3,v5,v7,v9) + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + + VCMPEQUB_RC(v7,v9,v10) + bnl cr6,.Ldiffoffset_vmx_diff_found + + subfic r6,r6,16 + subf r5,r6,r5 + add r3,r3,r6 + add r4,r4,r6 + +.Ldiffoffset_vmx_s1_16bytes_align: + /* now s1 is aligned with 16 bytes */ + lvx v6,0,r4 + LVS v4,0,r4 + srdi r6,r5,5 /* loop for 32 bytes each */ + clrldi r5,r5,59 + mtctr r6 + +.balign 16 +.Ldiffoffset_vmx_32bytesloop: + /* the first qw of r4 was saved in v6 */ + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + bdnz .Ldiffoffset_vmx_32bytesloop + + EXIT_VMX_OPS + + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +.Ldiffoffset_vmx_diff_found: + EXIT_VMX_OPS + /* anyway, the diff will appear in next 16 bytes */ + li r5,16 + b .Lcmp_lt32bytes + +#endif EXPORT_SYMBOL(memcmp) |