/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

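/*
 * For reference, a minimal C sketch of the semantics this routine
 * implements (compare n bytes, return the difference of the first
 * mismatching pair as unsigned chars, or 0 if equal):
 *
 *     int memcmp(const void* lhs, const void* rhs, size_t n) {
 *         const unsigned char* a = lhs;
 *         const unsigned char* b = rhs;
 *         while (n--) {
 *             int d = *a++ - *b++;
 *             if (d != 0) return d;
 *         }
 *         return 0;
 *     }
 *
 * Everything below is about reaching that result with word and NEON
 * loads instead of byte loads.
 */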
ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
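/*
 * Rough intrinsics sketch (assumed helper name, not part of this file) of
 * the 32-byte check used below; the asm uses vsub.i8 where this uses veor,
 * but equal bytes produce zero either way:
 *
 *     #include <arm_neon.h>
 *     #include <stdint.h>
 *
 *     // Returns nonzero if the two 32-byte blocks differ.
 *     static int blocks_differ_32(const uint8_t* a, const uint8_t* b) {
 *         uint8x16_t d0 = veorq_u8(vld1q_u8(a),      vld1q_u8(b));
 *         uint8x16_t d1 = veorq_u8(vld1q_u8(a + 16), vld1q_u8(b + 16));
 *         uint8x16_t m  = vorrq_u8(d0, d1);
 *         // Fold 128 bits to 64, then to two 32-bit lanes, mirroring
 *         // the vorr d4, d5 / vmov r3, ip, d4 sequence below.
 *         uint8x8_t  f  = vorr_u8(vget_low_u8(m), vget_high_u8(m));
 *         uint32x2_t w  = vreinterpret_u32_u8(f);
 *         return (vget_lane_u32(w, 0) | vget_lane_u32(w, 1)) != 0;
 *     }
 */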
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16
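        /* Note on the rewind above: q0 still holds the byte differences of
         * the first 16 bytes, so if those are all zero the mismatch is in
         * the second half and only 16 bytes need to be replayed by the
         * scalar code below.
         */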

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
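        /* e.g. src & 3 == 3 gives -src & 3 == 1: one byte to reach the
         * next word boundary; an already aligned src gives 0 and the
         * byte loop below is skipped.  In C: offset = -(uintptr_t)src & 3;
         */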
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer  */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent, i.e. (r4 ^ r1) & 3 == 0:
         * both have the same offset within a word
         */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

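        /* Rough C sketch (assumed names) of the read-ahead pattern below:
         * 'ahead' always holds the next word of the second buffer, so the
         * load for iteration i+1 overlaps the compare for iteration i.
         *
         *     uint32_t ahead = *b;            // matches ldr ip, [r1]
         *     while (n >= 8) {                // keeps the read-ahead in bounds
         *         uint32_t x = *a++;
         *         uint32_t y = ahead;
         *         ahead = *++b;               // matches ldr lr, [r1, #4]!
         *         if (x != y) break;          // matches eors
         *         n -= 4;
         *     }
         */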
        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes differ: back up and redo them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bit aligned, special-cased) */
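        /* Rough C sketch (assumed names, little-endian) of the merge used
         * below: r1 is rounded down to a word boundary and each misaligned
         * word of the second buffer is rebuilt from two aligned loads
         * (bounds handling elided; the code below guarantees >= 16 bytes):
         *
         *     uint32_t lo = *b++;                    // aligned load
         *     while (n >= 4) {
         *         uint32_t hi = *b++;
         *         uint32_t y  = (lo >> 16) | (hi << 16);
         *         if (*a++ != y) break;
         *         lo = hi;
         *         n -= 4;
         *     }
         */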

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 ran 2 bytes ahead of the logical position; re-adjust it */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* mismatch: fix up the 2 pointers and finish byte by byte */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount in bits (offset * 8)
        // r6 = left-shift amount in bits (32 - r5)
        // r7 = scratch word

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
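        /* e.g. offset 1 gives r5 = 8, r6 = 24, so each compared word is
         * (cur >> 8) | (next << 24) on little-endian; offset 3 gives
         * r5 = 24, r6 = 8.
         */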

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r1 ran (r6 / 8) bytes ahead of the logical position; re-adjust */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* mismatch: fix up the 2 pointers and finish byte by byte */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)