/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

#if defined(__ARM_NEON__)

        .text
        .fpu    neon

        .global memcpy
        .type   memcpy, %function
        .align  4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

memcpy:
        .fnstart
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f
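        /* The tail copies below use a flag trick that recurs throughout this
         * file: "movs ip, r3, lsl #31" moves bit 0 of the count into N and
         * bit 1 into C, so the mi-predicated load/store copies one byte and
         * the cs-predicated pairs copy two. The lsl #29 form further down
         * does the same with bit 2 (4 bytes) and bit 3 (8 bytes).
         */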
        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* immediately preload the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]
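        /* Each iteration of the main loop below moves 64 bytes through
         * registers d0-d7 and issues one pld PREFETCH_DISTANCE bytes ahead
         * of the current source pointer. The :128 alignment hints on the
         * stores are valid because the destination was aligned to 16 bytes
         * above.
         */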
1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
        .fnend


#else   /* __ARM_ARCH__ < 7 */


        .text

        .global memcpy
        .type   memcpy, %function
        .align  4

/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

memcpy:
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .fnstart
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3          /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10, [r1], #4           /*  4 bytes */
        strne       r10, [r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to 32 bytes.
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
        ldrhi       r3, [r12], #32          /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed, the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

/********************************************************************/
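        /* Overview of the non-congruent path below: the source is word
         * aligned but the destination is not, so source words are re-packed
         * before being stored. r3 carries the partial word between
         * iterations, r12 holds the right-shift amount and lr the left-shift
         * amount (in bits) used to splice consecutive source words together.
         */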
non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to a 50%
         * performance hit.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8
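        /* The three loops below are specialized for the three possible shift
         * amounts (16, 8 and 24 bits) so that immediate-form shifts can be
         * used. Each iteration consumes 32 bytes of source (the word
         * pre-loaded into r12 plus seven words via ldmia), splices them with
         * the word carried in r3, and stores eight words; ldrhs pre-loads
         * r12 for the next iteration.
         */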
loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12, [r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
        .fnend


#endif  /* __ARM_ARCH__ < 7 */