/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

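/*
 * For reference, the semantics implemented below match this minimal C
 * sketch (an illustration of the contract, not the code that ships):
 *
 *   int memcmp(const void* lhs, const void* rhs, size_t n) {
 *       const unsigned char* a = lhs;
 *       const unsigned char* b = rhs;
 *       while (n--) {
 *           int d = *a++ - *b++;
 *           if (d != 0) return d;
 *       }
 *       return 0;
 *   }
 *
 * Everything that follows is an unrolled, alignment-aware version of
 * this byte loop, with cache preloads tuned for the A9.
 */
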
.syntax unified

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same
         * (a zero length is handled by the small-block code at 10: below)
         */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
/*
 * NEON optimization
 * Comparing 32 bytes at a time
 */
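/*
 * Roughly, one pass of the NEON loop below is equivalent to the
 * following intrinsics sketch (illustration only; p0/p1 stand for the
 * two source pointers, 32 bytes per pass):
 *
 *   uint8x16_t a0 = vld1q_u8(p0), a1 = vld1q_u8(p0 + 16);
 *   uint8x16_t b0 = vld1q_u8(p1), b1 = vld1q_u8(p1 + 16);
 *   uint8x16_t d  = vorrq_u8(vsubq_u8(a0, b0), vsubq_u8(a1, b1));
 *   uint64x2_t m  = vreinterpretq_u64_u8(d);
 *   if ((vgetq_lane_u64(m, 0) | vgetq_lane_u64(m, 1)) != 0)
 *       ...  // the blocks differ somewhere in these 32 bytes
 *
 * A nonzero result only says *that* the blocks differ, not where, so
 * on a mismatch the code rewinds and lets the scalar path pinpoint the
 * differing byte.
 */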
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Subtract the values and merge the results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16
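        /* Either way, r0/r1 now point at the 16-byte half that contains
         * the first difference; after the count fix-up at 3: below, the
         * scalar code re-compares that half and pinpoints the differing
         * byte.
         */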

3:      /* fix up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align the first pointer to a word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* byte-wise compare until the first pointer is aligned */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b

0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (i.e. have the same
         * alignment modulo 4)
         */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f
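        /* What follows is roughly this C loop (sketch only; lhs/rhs are
         * illustrative names), with the body unrolled 8x in the asm;
         * `next` is the read-ahead word, which is why 32+4 bytes had to
         * be available:
         *
         *   uint32_t next = *rhs;            // the "ldr ip, [r1]" above
         *   do {
         *       uint32_t a   = *lhs++;
         *       uint32_t cur = next;
         *       next = *++rhs;               // fetched before the compare
         *       if (a != cur) break;         // -> label 2
         *   } while (...);
         */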

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f
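        /* At this point r2 holds (bytes remaining - 4): we subtracted
         * 32+4 up front but every loop pass consumed exactly the 32
         * bytes it compared, leaving the 4-byte read-ahead bias. The
         * bmi above therefore skips the word loop when fewer than 4
         * bytes remain; the bias itself is added back at label 4.
         */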

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes differ: rewind and redo them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, pc}

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0          /* r0 will hold the result */
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (halfword-aligned, special-cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* round the unaligned pointer down to a word boundary */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
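        /* The loop below is roughly this C sketch (illustration only;
         * aligned_rhs is a stand-in name): rhs was just rounded down to
         * a word boundary, so each logical 32-bit chunk is rebuilt from
         * the top halfword of one aligned word and the bottom halfword
         * of the next (little-endian):
         *
         *   uint32_t w = *aligned_rhs++;     // the "ldr lr" above
         *   do {
         *       uint32_t cur = w >> 16;
         *       w = *aligned_rhs++;
         *       cur |= w << 16;
         *       if (*lhs++ != cur) break;    // -> label 7
         *   } while (...);                   // 4x unrolled, 16 B/pass
         */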

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* rewind r1 to the true (unaligned) position: we over-read 2 bytes */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the two pointers and finish byte by byte */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount
        // r6 = left-shift amount
        // r7 = scratch

        mov         r5, r0, lsl #3      /* r5 = right shift = 8 * offset */
        rsb         r6, r5, #32         /* r6 = left shift = 32 - r5 */

        /* round the unaligned pointer down to a word boundary */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8
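        /* Same rebuilding idea as the offset-2 loop, but with variable
         * shifts (sketch only): for offset k in {1, 3}, r5 = 8*k and
         * r6 = 32 - 8*k, so each logical word is, on little-endian ARM:
         *
         *   uint32_t cur = w >> (8*k);
         *   w = *aligned_rhs++;
         *   cur |= w << (32 - 8*k);
         *   if (*lhs++ != cur) break;        // -> label 7
         *
         * The loop below unrolls this twice (8 bytes per pass).
         */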

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* rewind r1 to the true (unaligned) position:
         * r6 >> 3 == 4 - offset bytes were over-read
         */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the two pointers and finish byte by byte */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)