//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
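//
// Overview of the strategy implemented below, as an informal C-like sketch
// (illustrative only; the helper names are not defined in this file):
//
//   void *CopyForward (void *dst, const void *src, UINTN n)
//   {
//     if (n <= 16)       CopySmall (dst, src, n);    // 0..16 bytes
//     else if (n <= 96)  CopyMedium (dst, src, n);   // fully unrolled
//     else               CopyLarge (dst, src, n);    // 64 bytes/iteration
//     return dst;
//   }
//
// The small and medium paths load all source data before storing any of it,
// so they remain correct for overlapping buffers as well.
//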
#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy
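    // The three instructions above implement the dispatch described in the
    // comment before this function. Informal C-equivalent (a sketch only,
    // using unsigned arithmetic; not code that exists elsewhere):
    //
    //   if (count <= 96 || (UINT64)(dstin - src) >= count)
    //     return __memcpy (dstin, src, count);
    //
    // "cmp count, 96" sets the HI condition only when count > 96. Only in
    // that case does "ccmp tmp2, count, 2, hi" really compare dstin - src
    // with count; otherwise it writes nzcv = 2 (C flag set) so that "b.hs"
    // is always taken for small and medium copies. When the compare does
    // happen, HS means dstin - src >= count (unsigned), i.e. the copy is
    // either non-overlapping or a backwards move, both of which the comment
    // above notes are safe to hand to memcpy.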
    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret