/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

        .text
        .fpu    neon

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

ENTRY(memcpy)
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
        /* bit0 of r3 -> N flag (copy 1 byte), bit1 -> C flag (copy 2 bytes) */
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to
         * make a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3},  [r1]!
        vld1.8      {d4 - d7},  [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3},  [r0, :128]!
        vst1.8      {d4 - d7},  [r0, :128]!
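        // (NEON stores do not touch the APSR flags, so the bhs below still
        // tests the result of the "subs r2, r2, #64" above; placing the
        // stores between them helps hide store latency.)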
        bhs         1b

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3},  [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
END(memcpy)


#else   /* __ARM_ARCH__ < 7 */


/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11, which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len < 4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
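         * (On the ARM926EJ-S a cache line is 8 words, i.e. 32 bytes, which
         * is why the loop below moves 32 bytes per iteration and the preload
         * address is aligned with "bic #0x1F".)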
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding
        // subs. When it is safely guarded, the only way to get a SIGSEGV
        // here is if the caller overstates the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /*********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes,
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
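         * (stmea is the "empty ascending" synonym for stmia: store multiple,
         * increment after, so r5-r11 land in the 28 bytes reserved by the
         * earlier "sub sp, sp, #28".)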
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr  = left  */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * the performance hit.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
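        // keep looping while at least one full input word remains; r3 still
        // holds the unwritten high bytes for partial_word_tail below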
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif    /* __ARM_ARCH__ < 7 */
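
/*
 * For reference, the non_congruent path above is the classic word-at-a-time
 * misaligned copy. A minimal C sketch of the same idea (hypothetical helper,
 * not part of this file's build; little-endian only, with the alignment
 * prologue and the final partial word omitted; 'right' and 'left' play the
 * roles of r12 and lr above):
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     // dst is word-aligned here; src was word-aligned, right = 8*(-dst&3)
 *     static void shift_copy(uint32_t *dst, const uint32_t *src,
 *                            size_t len, unsigned right)
 *     {
 *         unsigned left = 32 - right;        // right is 8, 16 or 24
 *         uint32_t carry = *src++ >> right;  // low bytes already written
 *         while (len >= 4) {
 *             uint32_t next = *src++;
 *             *dst++ = carry | (next << left);   // merge the two halves
 *             carry  = next >> right;            // keep the leftover bytes
 *             len   -= 4;
 *         }
 *     }
 *
 * loop8/loop16/loop24 unroll this eight times and bake the shift amounts in
 * as immediates, since register-specified shifts cost an extra cycle on the
 * ARM9.
 */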