//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//

#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend
#define F_h       dst
#define tmp1      x9
#define tmp2      x3
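
// Note that some of the aliases above intentionally share registers:
// F_l/F_h map onto srcend/dst and are used only in L(copy96), after those
// values are no longer needed, and tmp1/tmp2 likewise overlap B_h/dst at
// points where the shadowed registers are not live.
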
#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as for non-overlapping copies.
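//
// A rough C-level sketch of this dispatch (illustrative only, not part of
// the build):
//
//   if (count <= 16)       { /* L(copy16): 0..16 bytes           */ }
//   else if (count <= 96)  { /* fully unrolled, 17..96 bytes     */ }
//   else                   { /* L(copy_long): 64 bytes per loop  */ }
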
__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)      // count > 64: take the 64..96 byte path.
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f             // count <= 32: head and tail pairs suffice.
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    // Copy 8..16 bytes: two 8-byte loads and stores that may overlap.
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    // Copy 4..7 bytes: two 4-byte loads and stores that may overlap.
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
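    // For example (illustrative values): with count==3, tmp1==1, so the
    // strb instructions below write dstin[0], dstin[1] and dstin[2],
    // covering each of the three bytes exactly once.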
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes.  Copy 64 bytes from the start and
    // 32 bytes from the end.
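    // When count is below 96 the two blocks overlap in the middle; the
    // overlapping bytes are simply written twice, which is harmless since
    // every load on this path happens before any store.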
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
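    // Worked example (illustrative values): for dstin == 0x1003 and
    // count == 200, tmp1 == 3 and dst == 0x1000; src is moved back by 3
    // and count becomes 203, which is 16 more than the bytes still to be
    // copied from dst + 16 onwards. The "subs count, count, 128 + 16"
    // below accounts for this.
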
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves of up to 96 bytes are done by memcpy, since it reads all of
// the data before writing any of it and therefore supports overlaps. Larger
// copies whose destination lies below the source are also handled by memcpy.
// The only remaining case is a large copy whose destination overlaps the
// source at a higher address; it is copied backwards, from the end. The
// destination is aligned, and an unrolled loop processes 64 bytes per
// iteration.
//
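// A rough C-level sketch of the entry test below (illustrative only, not
// part of the build):
//
//   if (count <= 96 || (UINTN)(dstin - src) >= count)
//     return __memcpy (dstin, src, count);  // overlap (if any) is safe there
//   if (dstin == src)
//     return;                               // nothing to move
//   // else dstin lies inside [src, src + count): copy backwards from the end
//
// The cmp/ccmp pair implements the first test: when count <= 96 the ccmp
// sets the carry flag directly, so b.hs is always taken; otherwise it
// compares dstin - src against count.
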
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy                // Small/medium copy, or no harmful overlap.

    cbz     tmp2, 3f                // Return if dst == src.
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret
    285