/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

#if defined(__ARM_NEON__)

        .text
        .fpu    neon

        .global memcpy
        .type memcpy, %function
        .align 4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
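/* i.e. the main loop below keeps its prefetches 256 bytes ahead of the read pointer */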

memcpy:
        .fnstart
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy? (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
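        /* lsl #31 moves bit 0 of the count into N and bit 1 into C, so the
         * conditional byte loads/stores below copy exactly 1 and/or 2 bytes;
         * lsl #29 does the same for the 4- and 8-byte cases that follow
         */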
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bit aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bit aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* immediately preload the next cache lines, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f
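        /* from here on r2 holds the remaining count biased by -64; it is
         * corrected at label 2 below
         */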

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3},   [r1]!
        vld1.8      {d4 - d7},   [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3},   [r0, :128]!
        vst1.8      {d4 - d7},   [r0, :128]!
        bhs         1b
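        /* one pld per 64-byte iteration keeps the read stream roughly
         * PREFETCH_DISTANCE ahead of r1 (presumably the point of the
         * preload block above)
         */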

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3},  [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bit aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
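        /* same flag trick as above, in descending order: bit 3 of r2 selects
         * an 8-byte copy, bit 2 a 4-byte copy, and bits 1 and 0 the final
         * 2 and 1 bytes
         */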
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
        .fnend

#else   /* !__ARM_NEON__ */

        .text

        .global memcpy
        .type memcpy, %function
        .align 4

        /*
         * Optimized memcpy() for ARM.
         *
         * Note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

memcpy:
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .fnstart
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later */
        .pad        #28
        sub         sp, sp, #28
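        /* 28 bytes = 7 words, matching the r5-r11 spill area used below */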

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len < 4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * an ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3          /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
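        /* lsl #28 moves bit 4 of the byte count r3 into C and bit 3 into N:
         * the CS ldm/stm pair moves 16 bytes, the MI pair 8 bytes, and the
         * tst below handles the remaining 4-byte case
         */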
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10, [r1], #4           /*  4 bytes */
        strne       r10, [r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination has (normally) been aligned to a cache line
         * (32 bytes) and the source shares its word alignment.
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, {r4-r11}
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding
        // subs. When it is safely guarded, the only way to get a SIGSEGV
        // here is if the caller overstates the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, {r4-r11}
        bhs         1b

        add         r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left  */
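        /* r3 acts as the shift queue: it holds the lr low-order bits left
         * over from the previous source word; each new word contributes its
         * low r12 bits (shifted left by lr) to complete an output word, and
         * its high lr bits (shifted right by r12) become the new remainder
         */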

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for
         * alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to a 50%
         * performance hit.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8
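        /* the only remaining case is r12 == 16: fall through to loop16 */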

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #24
        bhs         1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
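        /* lr>>3 is the number of bytes still buffered in r3; lsl #(31-3)
         * maps that count onto N (copy 1 byte) and C (copy 2 bytes), just
         * like the small-copy sequences above
         */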
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12, [r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
        .fnend

#endif  /* !__ARM_NEON__ */