//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
// F_l/F_h and tmp1/tmp2 deliberately reuse registers whose original
// values are no longer needed at the points where these aliases are used.
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    AARCH64_BTI(c)
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi      // If count > 96, compare (dstin - src) with count; otherwise force the C flag.
    b.hs    __memcpy                // memcpy handles count <= 96 and all copies with (dstin - src) >= count.

    cbz     tmp2, 3f                // Nothing to do if dstin == src.
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret
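
//
// For reference, a minimal C-level sketch of the overall dispatch
// implemented above, kept inside a comment so that this file still
// assembles. The names CopyMemSketch, CopySmall, CopyMedium, CopyLarge
// and CopyLargeBackward are hypothetical; the real work is done by the
// assembly routines in this file.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   void *CopyMemSketch (void *dst, const void *src, size_t count)
//   {
//     uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;
//
//     // __memcpy handles copies of up to 96 bytes (all data is read
//     // before it is written, so any overlap is harmless) and larger
//     // copies where a forward copy is safe: non-overlapping buffers or
//     // dst below src. The unsigned test diff >= count covers both.
//     if (count <= 96 || diff >= count) {
//       if (count <= 16) return CopySmall (dst, src, count);   // 0..16 bytes
//       if (count <= 96) return CopyMedium (dst, src, count);  // 17..96 bytes, fully unrolled
//       return CopyLarge (dst, src, count);                    // align dst, 64 bytes per iteration
//     }
//     if (diff == 0) {
//       return dst;                                            // dst == src: nothing to do
//     }
//     // Remaining case: a large copy with dst inside (src, src + count).
//     // Copy backwards from the end, 64 bytes per iteration.
//     return CopyLargeBackward (dst, src, count);
//   }
//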