Home | History | Annotate | Download | only in AArch64
      1 /*
      2  * Copyright (c) 2011 - 2013, ARM Ltd
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. The name of the company may not be used to endorse or promote
     14  *    products derived from this software without specific prior written
     15  *    permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
     18  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     22  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 
      30 .text                           // assemble what follows into the code section
      31 .align 2                        // GAS on AArch64 takes a power of two here: 4-byte alignment
      32 
      33 
         // NOTE(review): the export macro used below is defined outside this
         // file; presumably it makes the memcpy symbol globally visible --
         // confirm against the macro definition.
      34 GCC_ASM_EXPORT(memcpy)
     35 
     36 
      37 // Taken from Newlib BSD implementation.
         //
         // void *memcpy(void *dst, const void *src, size_t nb)
         //
         // In:   x0 = dst, x1 = src, x2 = nb (byte count)
         // Out:  x0 = dst; x0 is never written, so it is returned as passed in
         // Uses: x1-x6 and x8-x13 as scratch, plus the condition flags.
         //       No stack accesses and no calls.
         //
         // Strategy: for nb >= 64, copy one unaligned 16-byte chunk and
         // advance so that dst is 16-byte aligned, then copy 64-byte blocks,
         // then 16-byte chunks, then an 8/4/2/1-byte tail selected by the
         // low four bits of the remaining count.
         //
         // NOTE(review): src is never aligned, and dst is only aligned on
         // the nb >= 64 path, so the loads and stores here can be unaligned.
         // Presumably the calling environment permits unaligned accesses --
         // confirm.
      38 ASM_PFX(memcpy):
      39         // Copy dst to x6, so we can preserve return value.
      40         mov     x6, x0                  // x6 = working dst pointer
      41 
      42         // NOTE: although size_t is unsigned, this code uses signed
      43         // comparisons on x2 so relies on nb never having its top bit
      44         // set. In practice this is not going to be a real problem.
      45 
      46         // Require at least 64 bytes to be worth aligning.
      47         cmp     x2, #64
      48         blt     qwordcopy               // signed: nb < 64, skip alignment and block loop
      49 
      50         // Compute offset to align destination to 16 bytes.
      51         neg     x3, x0
      52         and     x3, x3, 15              // x3 = (-dst) & 15, i.e. 0..15 bytes to the next boundary
      53 
      54         cbz     x3, blockcopy           // offset == 0 is likely
      55 
      56         // We know there is at least 64 bytes to be done, so we
      57         // do a 16 byte misaligned copy at first and then later do
      58         // all 16-byte aligned copies.  Some bytes will be copied
      59         // twice, but there is no harm in that since memcpy does not
      60         // guarantee correctness on overlap.
      61 
      62         sub     x2, x2, x3              // nb -= offset
      63         ldp     x4, x5, [x1]            // load the first 16 bytes of src
      64         add     x1, x1, x3              // src += offset
      65         stp     x4, x5, [x6]            // store them at the unaligned start of dst
      66         add     x6, x6, x3              // dst += offset
      67 
      68         // The destination pointer is now qword (16 byte) aligned.
      69         // (The src pointer might be.)
      70 
      71 blockcopy:
      72         // Copy 64 bytes at a time.
                 // Inside the loop x2 is kept 64 below the true remaining count,
                 // so the sign of each subs result says whether another full
                 // block is available.  On entry x2 may already be below 64
                 // (as low as 49 after the alignment step), hence the initial
                 // test before the loop.
      73         subs    x2, x2, #64
      74         blt     3f
      75 2:      subs    x2, x2, #64
      76         ldp     x4, x5, [x1,#0]
      77         ldp     x8, x9, [x1,#16]
      78         ldp     x10,x11,[x1,#32]
      79         ldp     x12,x13,[x1,#48]
      80         add     x1, x1, #64
      81         stp     x4, x5, [x6,#0]
      82         stp     x8, x9, [x6,#16]
      83         stp     x10,x11,[x6,#32]
      84         stp     x12,x13,[x6,#48]
      85         add     x6, x6, #64
      86         bge     2b                      // tests the subs at 2: (ldp/stp/add leave the flags alone)
      87 
      88         // Unwind pre-decrement
      89 3:      add     x2, x2, #64             // x2 = true remaining count, 0..63
      90 
      91 qwordcopy:
      92         // Copy 0-48 bytes, 16 bytes at a time.
                 // Reached with x2 = remaining byte count: 0..63 when falling
                 // through from the block loop, or the original nb when the
                 // entry test found it below 64.
      93         subs    x2, x2, #16
      94         blt     tailcopy                // fewer than 16 bytes remain
      95 2:      ldp     x4, x5, [x1],#16        // post-index: src += 16
      96         subs    x2, x2, #16
      97         stp     x4, x5, [x6],#16        // post-index: dst += 16; flags from subs survive
      98         bge     2b                      // 2b = nearest preceding 2:, i.e. this loop
      99 
     100         // No need to unwind the pre-decrement, it would not change
     101         // the low 4 bits of the count. But how likely is it for the
     102         // byte count to be multiple of 16? Is it worth the overhead
     103         // of testing for x2 == -16?
                 // For counts in the supported range x2 is in -16..-1 here;
                 // bits 0-3 still hold the number of bytes left to copy.
     104 
     105 tailcopy:
     106         // Copy trailing 0-15 bytes.
                 // Each tbz skips its copy when the tested bit of x2 is clear;
                 // bits 3, 2, 1, 0 select the 8-, 4-, 2- and 1-byte copies.
     107         tbz     x2, #3, 1f
     108         ldr     x4, [x1],#8             // copy 8 bytes
     109         str     x4, [x6],#8
     110 1:
     111         tbz     x2, #2, 1f
     112         ldr     w4, [x1],#4             // copy 4 bytes
     113         str     w4, [x6],#4
     114 1:
     115         tbz     x2, #1, 1f
     116         ldrh    w4, [x1],#2             // copy 2 bytes
     117         strh    w4, [x6],#2
     118 1:
     119         tbz     x2, #0, return
     120         ldrb    w4, [x1]                // copy 1 byte
     121         strb    w4, [x6]                // last access, so no pointer update
     122 
     123 return:
     124         // This is the only return point of memcpy.
                 // x0 still holds the original dst.
     125         ret
    126