/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#include <private/libc_events.h>

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

        .syntax unified

ENTRY(__memcpy_chk)
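        // r0 = dst, r1 = src, r2 = byte count, r3 = destination buffer size
        // supplied by the compiler's fortify machinery; abort if the copy
        // would write past the end of the buffer.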
        cmp         r2, r3
        bhi         __memcpy_chk_fail

        // Fall through to memcpy...
END(__memcpy_chk)

ENTRY(memcpy)
        /* The stack must always be 64-bit aligned to comply with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        stmfd       sp!, {r0, r4, lr}
        .cfi_def_cfa_offset 12
        .cfi_rel_offset r0, 0
        .cfi_rel_offset r4, 4
        .cfi_rel_offset lr, 8
        /* Make room for r5-r11, which will be spilled later. */
        sub         sp, sp, #28
        .cfi_adjust_cfa_offset 28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld         [r0, #0]
        pld         [r1, #0]
        pld         [r1, #32]

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         .Lcopy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
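        /* e.g. src&3 == 1 needs 3 bytes to reach the next word boundary,
         * src&3 == 2 needs 2, src&3 == 3 needs 1, and src&3 == 0 needs
         * none -- exactly (-src) & 3.
         */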
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         .Lsrc_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
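        /* lsl #31 moves bit 0 of the byte count into N and bit 1 into the
         * carry, so the mi/cs loads and stores below copy exactly 1, 2 or
         * 3 bytes.
         */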
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi      r3, [r1], #1
        ldrbcs      r4, [r1], #1
        ldrbcs      r12,[r1], #1
        strbmi      r3, [r0], #1
        strbcs      r4, [r0], #1
        strbcs      r12,[r0], #1

.Lsrc_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         .Lnon_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Lcongruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
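        /* lsl #28 moves bit 4 of the byte count into the carry and bit 3
         * into N, so the cs/mi ldm/stm pairs move 16 and 8 bytes, and the
         * tst below handles the remaining word.
         */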
        movs        r12, r3, lsl #28
        ldmcs       r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi       r1!, {r8, r9}           /*  8 bytes */
        stmcs       r0!, {r4, r5, r6, r7}
        stmmi       r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

.Lcongruent_aligned32:
        /*
         * here the destination is aligned to a 32-byte cache line;
         * the source is word-aligned and congruent with it modulo 4.
         */

.Lcached_aligned32:
        subs        r2, r2, #32
        blo         .Lless_than_32_left

        /*
         * We preload a cache line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache line
         * into the write buffer (which should have some free space).
         * When the linefill is done, the write buffer will
         * start dumping its contents into memory.
         *
         * While all this is going on, we load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        pld         [r12, #64]
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 bytes ahead of r1, the following ldrhi
        // used as a cheap ARM9 preload is not safely guarded by the preceding
        // subs. When it is safely guarded, the only way to get a SIGSEGV here
        // is for the caller to overstate the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32

.Lless_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (when it does run, the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
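        /* Same flag trick as above: lsl #28 selects the 16- and 8-byte
         * chunks, then lsl #30 below moves bit 2 into the carry and bit 1
         * into N for the 4- and 2-byte tails.
         */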
        movs        r12, r2, lsl #28
        ldmcs       r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi       r1!, {r8, r9}           /*  8 bytes */
        stmcs       r0!, {r4, r5, r6, r7}
        stmmi       r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        ldrbne      r3, [r1]                /*  last byte  */
        strbne      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, pc}

        /********************************************************************/

.Lnon_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         .Lcopy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes needed to align dst */
        mov         r12, r5, lsl #3     /* r12 = right-shift amount */
        rsb         lr, r12, #32        /* lr = left-shift amount */
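        /* Example (little-endian): if dst & 3 == 1, r5 = 3, so r12 = 24 and
         * lr = 8. After the alignment stores below, r3 keeps lr/8 = 1 byte
         * of the current source word; each iteration then emits
         * r3 | (next << lr) and carries next >> r12 into the following one.
         */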

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for
         * alignment)
         */
        movs        r5, r5, lsl #31
        strbmi      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strbcs      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strbcs      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         .Lpartial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         .Lpartial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         .Lless_than_thirtytwo

        /* Use immediate mode for the shifts, because register shifts cost
         * an extra cycle each, which could add up to a 50% performance hit.
         */
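        /* The three loops below are specialized on the right-shift amount
         * in r12 (16, 8 or 24 bits); each copies 32 bytes per iteration,
         * merging adjacent source words with immediate shifts, and branches
         * or falls through to .Lless_than_thirtytwo when fewer than 32
         * bytes remain.
         */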

        cmp         r12, #24
        beq         .Lloop24
        cmp         r12, #8
        beq         .Lloop8

.Lloop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #16
        mov         r4, r4,         lsr #16
        orr         r4, r4, r5,     lsl #16
        mov         r5, r5,         lsr #16
        orr         r5, r5, r6,     lsl #16
        mov         r6, r6,         lsr #16
        orr         r6, r6, r7,     lsl #16
        mov         r7, r7,         lsr #16
        orr         r7, r7, r8,     lsl #16
        mov         r8, r8,         lsr #16
        orr         r8, r8, r9,     lsl #16
        mov         r9, r9,         lsr #16
        orr         r9, r9, r10,    lsl #16
        mov         r10, r10,       lsr #16
        orr         r10, r10, r11,  lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #16
        bhs         1b
        b           .Lless_than_thirtytwo

.Lloop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #24
        mov         r4, r4,         lsr #8
        orr         r4, r4, r5,     lsl #24
        mov         r5, r5,         lsr #8
        orr         r5, r5, r6,     lsl #24
        mov         r6, r6,         lsr #8
        orr         r6, r6, r7,     lsl #24
        mov         r7, r7,         lsr #8
        orr         r7, r7, r8,     lsl #24
        mov         r8, r8,         lsr #8
        orr         r8, r8, r9,     lsl #24
        mov         r9, r9,         lsr #8
        orr         r9, r9, r10,    lsl #24
        mov         r10, r10,       lsr #8
        orr         r10, r10, r11,  lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #8
        bhs         1b
        b           .Lless_than_thirtytwo

.Lloop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #8
        mov         r4, r4,         lsr #24
        orr         r4, r4, r5,     lsl #8
        mov         r5, r5,         lsr #24
        orr         r5, r5, r6,     lsl #8
        mov         r6, r6,         lsr #24
        orr         r6, r6, r7,     lsl #8
        mov         r7, r7,         lsr #24
        orr         r7, r7, r8,     lsl #8
        mov         r8, r8,         lsr #24
        orr         r8, r8, r9,     lsl #8
        mov         r9, r9,         lsr #24
        orr         r9, r9, r10,    lsl #8
        mov         r10, r10,       lsr #24
        orr         r10, r10, r11,  lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #24
        bhs         1b


.Lless_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it  */
        add         r2, r2, #32
        cmp         r2, #4
        blo         .Lpartial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

.Lpartial_word_tail:
        /* we have a partial word in the input buffer */
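        /* lr/8 bytes of the last source word are still queued in r3.
         * lsl #(31-3) moves bit 3 of lr into N and bit 4 into the carry,
         * so the stores below write exactly 1, 2 or 3 tail bytes.
         */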
        movs        r5, lr, lsl #(31-3)
        strbmi      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strbcs      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strbcs      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

.Lcopy_last_3_and_return:
        movs        r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi      r2, [r1], #1
        ldrbcs      r3, [r1], #1
        ldrbcs      r12,[r1]
        strbmi      r2, [r0], #1
        strbcs      r3, [r0], #1
        strbcs      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp,  sp, #28
        ldmfd       sp!, {r0, r4, pc}
END(memcpy)

        // Only reached when the __memcpy_chk check fails.
ENTRY_PRIVATE(__memcpy_chk_fail)
        // Preserve lr for backtrace.
        push    {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        ldr     r0, error_message
        ldr     r1, error_code
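        // error_message holds error_string - (1b + 8). In ARM state the pc
        // reads as the current instruction plus 8, so the add at 1: below
        // turns that offset into the absolute address of error_string.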
1:
        add     r0, pc
        bl      __fortify_chk_fail
error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
        .word   error_string-(1b+8)
END(__memcpy_chk_fail)

        .data
error_string:
        .string     "memcpy: prevented write past end of buffer"