/*
 * Copyright (c) 2011 - 2013, ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


    .text
    .align 2


GCC_ASM_EXPORT(memcpy)


// Taken from Newlib BSD implementation.
//
// memcpy: copy nb bytes from src to dst and return dst.
//
//   In:       x0 = dst, x1 = src, x2 = nb (byte count, unsigned)
//   Out:      x0 = dst (x0 is never written by this routine)
//   Clobbers: x1-x6, x8-x13, condition flags
//   Stack:    not used
//
// The source and destination must not overlap: the alignment step below
// stores a 16-byte chunk and then re-copies part of it.
ASM_PFX(memcpy):
    // Work on a copy of dst in x6, so we can preserve the return value.
    mov     x6, x0

    // nb is a size_t, so every test on x2 below uses the unsigned
    // conditions: "lo" (the subtraction borrowed) and "hs" (it did not).
    // For any count below 2^63 these agree with the signed lt/ge tests.

    // Require at least 64 bytes to be worth aligning.
    cmp     x2, #64
    b.lo    .Lqwordcopy

    // Compute offset to align destination to 16 bytes.
    neg     x3, x0
    and     x3, x3, #15

    cbz     x3, .Lblockcopy         // offset == 0 is likely

    // We know there is at least 64 bytes to be done, so we
    // do a 16 byte misaligned copy at first and then later do
    // all 16-byte aligned copies.  Some bytes will be copied
    // twice, but there is no harm in that since memcpy does not
    // guarantee correctness on overlap.

    sub     x2, x2, x3              // nb -= offset (nb >= 64 > offset: no wrap)
    ldp     x4, x5, [x1]
    add     x1, x1, x3
    stp     x4, x5, [x6]
    add     x6, x6, x3

    // The destination pointer is now qword (16 byte) aligned.
    // (The src pointer might be.)

.Lblockcopy:
    // Copy 64 bytes at a time.  x2 is kept pre-decremented by 64 so
    // that the flags of each subs tell whether one more full block
    // remains; ldp/stp/add leave those flags untouched until the
    // branch at the bottom of the loop.
    subs    x2, x2, #64
    b.lo    3f
2:  subs    x2, x2, #64
    ldp     x4, x5, [x1, #0]
    ldp     x8, x9, [x1, #16]
    ldp     x10, x11, [x1, #32]
    ldp     x12, x13, [x1, #48]
    add     x1, x1, #64
    stp     x4, x5, [x6, #0]
    stp     x8, x9, [x6, #16]
    stp     x10, x11, [x6, #32]
    stp     x12, x13, [x6, #48]
    add     x6, x6, #64
    b.hs    2b

    // Unwind pre-decrement
3:  add     x2, x2, #64

.Lqwordcopy:
    // Copy 0-48 bytes, 16 bytes at a time.
    subs    x2, x2, #16
    b.lo    .Ltailcopy
2:  ldp     x4, x5, [x1], #16
    subs    x2, x2, #16
    stp     x4, x5, [x6], #16
    b.hs    2b

    // No need to unwind the pre-decrement, it would not change
    // the low 4 bits of the count.  But how likely is it for the
    // byte count to be multiple of 16?  Is it worth the overhead
    // of testing for x2 == -16?

.Ltailcopy:
    // Copy trailing 0-15 bytes, selected by bits 3..0 of x2.
    tbz     x2, #3, 1f
    ldr     x4, [x1], #8            // copy 8 bytes
    str     x4, [x6], #8
1:
    tbz     x2, #2, 1f
    ldr     w4, [x1], #4            // copy 4 bytes
    str     w4, [x6], #4
1:
    tbz     x2, #1, 1f
    ldrh    w4, [x1], #2            // copy 2 bytes
    strh    w4, [x6], #2
1:
    tbz     x2, #0, .Lreturn
    ldrb    w4, [x1]                // copy 1 byte
    strb    w4, [x6]

.Lreturn:
    // This is the only return point of memcpy.
    ret