/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

    /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */

        /* Use the version of memcpy implemented using LDRD and STRD.
           This version is tuned for Cortex-A15.
           This might not be the best for other ARMv7-A CPUs,
           but there is no predefine to distinguish between
           different CPUs in the same architecture,
           and this version is better than the plain memcpy provided in newlib.

           Therefore, we use this version for all ARMv7-A CPUs.  */

        /* To make the same code compile for both ARM and Thumb instruction
           sets, switch to unified syntax at the beginning of this function.
           However, by using the same code, we may be missing optimization
           opportunities.  For instance, in LDRD/STRD instructions, the first
           destination register must be even and the second consecutive in
           ARM state, but not in Thumb state.  */

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax         unified

ENTRY(memcpy)

       /* Assumes that n >= 0 and that dst and src are valid pointers.
          If there are at least 8 bytes to copy, use LDRD/STRD.
          If src and dst are misaligned with different offsets,
          first copy byte by byte until dst is aligned,
          and then copy using LDRD/STRD and shift if needed.
          When fewer than 8 bytes are left, copy a word and then byte by byte.  */

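       /* Register roles in the code below (as used by this implementation):
          r0 = dst (advances; the original value is saved and returned),
          r1 = src (advances), r2 = remaining byte count (kept biased so that
          flag-setting subs/adds can drive the branches), r3-r7 and lr = copy
          data and scratch, ip = alignment scratch.  */
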
       /* Save registers (r0 holds the return value):
          optimized push {r0, r4, r5, r6, r7, lr}.
          To try to improve performance, the stack layout is changed,
          i.e., it does not keep the layout users expect
          (highest-numbered register at the highest address).  */
        .save   {r0, lr}
        push    {r0, lr}
        .save   {r4, r5}
        strd    r4, r5, [sp, #-8]!
        .save   {r6, r7}
        strd    r6, r7, [sp, #-8]!
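
       /* Resulting stack layout (derived from the stores above), from the
          final sp upwards: r6, r7, r4, r5, r0, lr (six words, 24 bytes).  */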

       /* TODO: Add debug frame directives.
          We don't need exception unwind directives, because the code below
          does not throw any exceptions and does not call any other functions.
          Generally, newlib functions like this lack debug information for
          assembler source.  */
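
       /* A possible sketch of those debug-frame directives, assuming GAS CFI
          support (illustrative only, not enabled here):
              .cfi_sections .debug_frame
              .cfi_startproc
              ... a .cfi_adjust_cfa_offset 8 plus .cfi_rel_offset pairs after
              each of the three 8-byte saves above, and .cfi_endproc before
              END(memcpy) ...
          The exact placement here is an assumption, not part of this file.  */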

        /* Get copying of tiny blocks out of the way first.  */
        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4                 /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3                       /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned             /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3                      /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned            /* If src is not word-aligned.  */
word_aligned:
        /* Get here if both source and dst are word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Are there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64                /* If r2 + 4 < 64.  */
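
        /* Note: throughout this function r2 is kept biased below the true
           number of bytes remaining (the comments state the current bias),
           so that a single flag-setting subs/adds both updates the count and
           supplies the condition for the branch that follows.  */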

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores do not cross a cache-line
           boundary, as they are then more expensive even if the data is in
           the cache (they require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-byte aligned,
           then it is more important to align dst than src,
           because stores that cross a cache-line boundary are penalized
           more than loads.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned, i.e., bit 2 of dst is set,
           then copy 1 word (4 bytes) to align it.  */
        ands    r3, r0, #4
        beq     11f                  /* If dst already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

11:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
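
        /* The .irp block above expands to eight LDRD/STRD pairs at offsets
           0, 8, ..., 56, i.e.:
               ldrd    r4, r5, [r1, #0]
               strd    r4, r5, [r0, #0]
               ...
               ldrd    r4, r5, [r1, #56]
               strd    r4, r5, [r0, #56]  */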

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                            /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                            /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if there are fewer than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                          /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if there are fewer than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                          /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11}.  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 was set, i.e., r2 is 1 or 3,
           so one byte is copied.
           Condition cs means bit 1 of r2 was set, i.e., r2 is 2 or 3,
           so two more bytes are copied.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]
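
        /* For example, after lsls r2, r2, #31:
           r2 == 1 -> ne only    -> 1 byte copied above;
           r2 == 2 -> cs only    -> 2 bytes copied above;
           r2 == 3 -> ne and cs  -> 3 bytes copied above.  */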

return:
        /* Restore registers: optimized pop {r0, r4, r5, r6, r7, pc}.  */
        /* This is the only return point of memcpy.  */
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}

#ifndef __ARM_FEATURE_UNALIGNED

       /* The following assembly macro implements misaligned copy in software.
          It assumes that dst is word-aligned, that src is at an offset of
          "pull" bits from a word boundary, that push = 32 - pull, and that
          the number of bytes that remain to copy is r2 + 4, r2 >= 0.  */

       /* In the code below, r2 is the number of bytes that remain to be
          written.  The number of bytes read is always larger, because we have
          partial words in the shift queue.  */
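
       /* Worked example (little-endian, src one byte past a word boundary,
          so pull=8 and push=24): after the initial bic and ldr below, r5
          holds the aligned word whose top 24 bits are the first three source
          bytes b0 b1 b2.  Each step then computes
              r6 = (r5 lsr #8) | (next_word lsl #24)
          which assembles b0 b1 b2 b3 in memory order, i.e., the next dst
          word, while the freshly loaded word becomes the new shift queue.  */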

        .macro  miscopy pull push shiftleft shiftright

        /* Align src to the previous word boundary.  */
        bic     r1, r1, #3

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4                   /* Load a word from source.  */

        subs    r2, r2, #4
        blt     6f          /* Go to misaligned copy of less than 8 bytes.  */

       /* Get here if there are at least 8 bytes to copy.
          The number of bytes to copy is r2+8, r2 >= 0.  */

       subs     r2, r2, #56
       blt      4f         /* Go to misaligned copy of less than 64 bytes.  */

3:
       /* Get here if there are at least 64 bytes to copy.
          The number of bytes to copy is r2+64, r2 >= 0.  */

       /* Copy 64 bytes in every iteration.
          Use a partial word from the shift queue.  */
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        mov     r6, r5, \shiftleft #\pull
        ldrd    r4, r5, [r1, \offset]
        orr     r6, r6, r4, \shiftright #\push
        mov     r7, r4, \shiftleft #\pull
        orr     r7, r7, r5, \shiftright #\push
        strd    r6, r7, [r0, \offset]
        .endr

        add     r1, r1, #64
        add     r0, r0, #64
        subs    r2, r2, #64
        bge     3b

4:
       /* Get here if there are fewer than 64 bytes to copy (-64 <= r2 < 0)
          and they are misaligned.  */

       /* Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56

        blt     6f          /* Go to misaligned copy of less than 8 bytes.  */

5:
        /* Copy 8 bytes at a time.
           Use a partial word from the shift queue.  */
        mov     r6, r5, \shiftleft #\pull
        ldrd    r4, r5, [r1], #8
        orr     r6, r6, r4, \shiftright #\push
        mov     r7, r4, \shiftleft #\pull
        orr     r7, r7, r5, \shiftright #\push
        strd    r6, r7, [r0], #8

        subs    r2, r2, #8
        bge     5b                        /* If there is more to copy.  */

6:
        /* Get here if there are fewer than 8 bytes to copy (-8 <= r2 < 0)
           and they are misaligned.  */

        /* Check if there is more to copy.  */
        cmn     r2, #8
        beq     return

        /* Check if there are fewer than 4 bytes to copy.  */
        cmn     r2, #4

        itt     lt
        /* Restore the src offset from word alignment.  */
        sublt   r1, r1, #(\push / 8)
        blt     copy_less_than_4

        /* Use a partial word from the shift queue.  */
        mov     r3, r5, \shiftleft #\pull
        /* Load a word from src, but without writeback
           (this word is not fully written to dst).  */
        ldr     r5, [r1]

        /* Restore the src offset from word alignment.  */
        add     r1, r1, #(\pull / 8)

        /* Shift bytes to create one dst word and store it.  */
        orr     r3, r3, r5, \shiftright #\push
        str     r3, [r0], #4

        /* Use single byte copying of the remaining bytes.  */
        b       copy_less_than_4

        .endm

#endif /* not __ARM_FEATURE_UNALIGNED  */

dst_not_word_aligned:

       /* Get here when dst is not word-aligned and ip has the last 2 bits of
          dst, i.e., ip is the offset of dst from a word boundary.
          The number of bytes that remain to copy is r2 + 4,
          i.e., there are at least 4 bytes to copy.
          Write a partial word (1 to 3 bytes), such that dst becomes
          word-aligned.  */

       /* If dst is ip bytes past a word boundary (with 0 < ip < 4),
          then there are (4 - ip) bytes to fill up to align dst to the next
          word.  */
        rsb     ip, ip, #4                        /* ip = #4 - ip.  */
        cmp     ip, #2
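
       /* After the cmp, gt holds only when ip == 3 and ge only when ip >= 2;
          together with the unconditional pair that follows, exactly ip bytes
          are copied (1, 2, or 3).  */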

       /* Copy byte by byte with conditionals.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

       /* Update the count.
          ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
        blt     copy_less_than_4                  /* If r2 < ip.  */

       /* Get here if there are at least 4 bytes to copy.
          Check if src is aligned.  If beforehand src and dst were not word
          aligned but congruent (same offset), then now they are both
          word-aligned, and we can copy the rest efficiently (without
          shifting).  */
        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
        beq     word_aligned                  /* If r1 is word-aligned.  */

src_not_word_aligned:
       /* Get here when src is not word-aligned, but dst is word-aligned.
          The number of bytes that remain to copy is r2+4.  */

#ifdef __ARM_FEATURE_UNALIGNED
       /* Copy word by word using LDR when alignment can be handled in
          hardware, i.e., when SCTLR.A is clear and unaligned access is
          supported for LDR and STR.  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr
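
        /* The .irp above expands to 16 word copies (offsets 0..60); the LDRs
           may be unaligned, which the hardware handles on this path.  */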

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
       /* Get here if there are fewer than 64 but at least 4 bytes to copy,
          where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4

#else /* not __ARM_FEATURE_UNALIGNED  */

       /* ip has the last 2 bits of src,
          i.e., ip is the offset of src from a word boundary, and ip > 0.
          Compute the shifts needed to copy from src to dst.  */
        cmp     ip, #2
        beq     miscopy_16_16             /* If ip == 2.  */
        bge     miscopy_24_8              /* If ip == 3.  */

        /* Get here if ip == 1; fall through to miscopy_8_24.  */

        /* Instantiate the miscopy macro with shift directions chosen by
           endianness, so that bytes are reassembled in memory order.  */
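
       /* For example, on a little-endian build with ip == 1 (pull=8, push=24)
          the three not-yet-written source bytes sit in the upper 24 bits of
          the shift-queue word, so "lsr #8" moves them down to bits 0-23 and
          "lsl #24" places the next word's first byte in bits 24-31, producing
          the dst word in correct memory order.  On big-endian the byte order
          within a register is reversed, so the shift directions swap.  */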

#ifndef __ARMEB__
miscopy_8_24:   miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
miscopy_16_16:  miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
miscopy_24_8:   miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
#else  /* not __ARMEB__ */
miscopy_8_24:   miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
miscopy_16_16:  miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
miscopy_24_8:   miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
#endif  /* not __ARMEB__ */

#endif  /* not __ARM_FEATURE_UNALIGNED  */

END(memcpy)