/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result.  */
#ifdef BCOPY
#define origdstin  x1
#define origsrc    x0
#endif
#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

#ifdef BCOPY
ENTRY(bcopy)
        /* Swap src and dst so that a branch to memcpy doesn't cause issues.  */
        mov     tmp1, origsrc
        mov     origsrc, origdstin
        mov     origdstin, tmp1
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
        lsl     count, count, #2
#else
ENTRY(memmove)
#endif
        cmp     dstin, src
        b.lo    .Ldownwards
        add     tmp1, src, count
        cmp     dstin, tmp1
        b.hs    memcpy          /* No overlap.  */

        /* Upwards move with potential overlap.
         * Need to move from the tail backwards.  SRC and DST point one
         * byte beyond the remaining data to move.  */
        add     dst, dstin, count
        add     src, src, count
        cmp     count, #64
        b.ge    .Lmov_not_short_up

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63up:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15up
        sub     dst, dst, tmp1
        sub     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #32]
        stp     A_l, A_h, [dst, #32]
1:
        ldp     A_l, A_h, [src, #16]
        stp     A_l, A_h, [dst, #16]
2:
        ldp     A_l, A_h, [src]
        stp     A_l, A_h, [dst]
.Ltail15up:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
        tbz     count, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src, #-1]
        strb    tmp1w, [dst, #-1]
1:
        ret

.Lmov_not_short_up:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores.  */
        ands    tmp2, src, #15          /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src, #-1]!
        strb    tmp1w, [dst, #-1]!
1:

        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63up
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_up
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src, #-64]!
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst, #-64]!
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret

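        /* The copy loop below is software pipelined: each iteration stores
         * the 64 bytes fetched by the previous iteration while loading the
         * next 64, decrementing src and dst by 64 each pass.  */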
        /* Critical loop.  Start at a new Icache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_up:
        /* There are at least 128 bytes to move.  */
        ldp     A_l, A_h, [src, #-16]
        ldp     B_l, B_h, [src, #-32]
        ldp     C_l, C_h, [src, #-48]
        ldp     D_l, D_h, [src, #-64]!
1:
        stp     A_l, A_h, [dst, #-16]
        ldp     A_l, A_h, [src, #-16]
        stp     B_l, B_h, [dst, #-32]
        ldp     B_l, B_h, [src, #-32]
        stp     C_l, C_h, [dst, #-48]
        ldp     C_l, C_h, [src, #-48]
        stp     D_l, D_h, [dst, #-64]!
        ldp     D_l, D_h, [src, #-64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #-16]
        stp     B_l, B_h, [dst, #-32]
        stp     C_l, C_h, [dst, #-48]
        stp     D_l, D_h, [dst, #-64]!
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret


.Ldownwards:
        /* For a downwards move we can safely use memcpy provided that
         * DST is more than 16 bytes away from SRC.  */
        sub     tmp1, src, #16
        cmp     dstin, tmp1
        b.ls    memcpy          /* May overlap, but not critically.  */

        mov     dst, dstin      /* Preserve DSTIN for return value.  */
        cmp     count, #64
        b.ge    .Lmov_not_short_down

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63down:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15down
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
.Ltail15down:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lmov_not_short_down:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores.  */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src], #1
        strb    tmp1w, [dst], #1
1:

        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63down
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_down
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63down
        ret

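        /* As above, the loop is software pipelined: each iteration stores the
         * 64 bytes fetched by the previous iteration while loading the next
         * 64, advancing src and dst by 64 each pass.  */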
        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_down:
        /* There are at least 128 bytes to move.  */
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16           /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63down
        ret
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif