Home | History | Annotate | Download | only in bionic
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  *  * Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *  * Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in
     12  *    the documentation and/or other materials provided with the
     13  *    distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <machine/cpu-features.h>
     30 #include <machine/asm.h>
     31 
     32 /*
     33  * Optimized memcmp16() for ARM9.
     34  * This would not be optimal on XScale or ARM11, where more prefetching
     35  * and use of PLD will be needed.
     36  * The 2 major optimizations here are
     37  * (1) The main loop compares 16 bytes at a time
     38  * (2) The loads are scheduled in a way they won't stall
     39  */
     40 
     41 ENTRY(__memcmp16)
                /*
                 * __memcmp16: compare two buffers of 16-bit units.
                 *
                 * In:   r0 = first buffer, r1 = second buffer,
                 *       r2 = number of half-words (not bytes) to compare.
                 * Out:  r0 = 0 when every compared half-word matches (also when
                 *       the two pointers are identical or the count is zero);
                 *       otherwise the difference (first - second) of the first
                 *       pair of half-words that differ, each one loaded
                 *       zero-extended with ldrh.
                 * Uses: r1, r2, r3, ip and the flags may be modified.  The
                 *       large-block path pushes {r4, lr}, uses lr as a scratch
                 *       register and pops both again before returning.
                 *
                 * NOTE(review): only bit 1 of the addresses is ever tested, so
                 * the code presumably expects both pointers to be 2-byte
                 * aligned - confirm against the callers.
                 * NOTE(review): PLD is a macro that is not defined in this file;
                 * presumably a cache preload hint - confirm in the included
                 * headers.
                 */
     42         PLD         (r0, #0)
     43         PLD         (r1, #0)
     44 
     45         /* take care of the case where the length is zero or the buffers are the same */
     46         cmp         r0, r1
     47         cmpne       r2, #0              /* only evaluated when the pointers differ */
     48         moveq       r0, #0
     49         bxeq        lr
     50 
     51         /* since r0 holds the result, move the first source
     52          * pointer somewhere else
     53          */
     54 
     55         mov         r3, r0
     56 
     57          /* make sure we have at least 12 half-words, this simplifies things
     58           * below and avoids some overhead for small blocks
     59           */
     60 
     61         cmp         r2, #12
     62         bpl         0f                  /* r2 - 12 is not negative: word-wise path */
     63 
     64         /* small blocks (fewer than 12 half-words) */
     65         PLD         (r0, #32)
     66         PLD         (r1, #32)
     67 
     68 1:      ldrh        r0, [r3], #2        /* r0 = next half-word of the first buffer */
     69         ldrh        ip, [r1], #2        /* ip = next half-word of the second buffer */
     70         subs        r0, r0, ip          /* r0 = difference, zero when they match */
     71         bxne        lr                  /* mismatch: the difference is the result */
     72         subs        r2, r2, #1
     73         bne         1b
     74         bx          lr                  /* all equal: r0 is 0 from the last subs */
     75 
     76 
     77         .save {r4, lr}
     78         /* save registers */
     79 0:      stmfd       sp!, {r4, lr}       /* lr becomes a scratch register below; r4 is not */
                                                /* referenced again in this function - presumably */
                                                /* pushed to keep sp 8-byte aligned (confirm)     */
     80 
     81         /* align first pointer to word boundary */
     82         tst         r3, #2
     83         beq         0f                  /* bit 1 clear: nothing to do */
     84 
                /* compare a single half-word so that r3 moves on by 2 bytes */
     85         ldrh        r0, [r3], #2
     86         ldrh        ip, [r1], #2
     87         sub         r2, r2, #1
     88         subs        r0, r0, ip
     89         /* restore registers and return */
     90         ldmnefd     sp!, {r4, lr}
     91         bxne        lr
     92 
     93 
     94 0:      /* here the first pointer is word aligned, and at least 11 half-words
     95          * (12 minus the one possibly consumed above) are left to process.
     96          */
     97 
     98         /* see if the pointers are congruent (same value in address bit 1) */
     99         eor         r0, r3, r1
    100         ands        r0, r0, #2
    101         bne         5f
    102 
    103         /* congruent case, 16 half-words per iteration
    104          * We need to make sure there are at least 16+2 half-words left
    105          * because we effectively read ahead one long word, and we could
    106          * read past the buffer (and segfault) if we're not careful.
    107          */
    108 
    109         ldr         ip, [r1]            /* prefetch the first word of the second buffer */
    110         subs        r2, r2, #(16 + 2)   /* r2 = remaining half-words - 18 */
    111         bmi         1f
    112 
    113 0:
                /* Each step below compares a word loaded from r3 with the word
                 * of the second buffer that was fetched one step earlier; ip and
                 * lr take turns holding that prefetched word.  r1 is advanced
                 * with pre-indexed writeback, so it always points at the word
                 * that was fetched last.  As soon as a step finds a difference
                 * the Z flag is clear and every following "eq" instruction is
                 * skipped, which leaves r3 and r1 both 4 bytes past the word
                 * that differs.
                 */
    114         PLD         (r3, #64)
    115         PLD         (r1, #64)
    116         ldr         r0, [r3], #4
    117         ldr         lr, [r1, #4]!
    118         eors        r0, r0, ip
    119         ldreq       r0, [r3], #4
    120         ldreq       ip, [r1, #4]!
    121         eoreqs      r0, r0, lr
    122         ldreq       r0, [r3], #4
    123         ldreq       lr, [r1, #4]!
    124         eoreqs      r0, r0, ip
    125         ldreq       r0, [r3], #4
    126         ldreq       ip, [r1, #4]!
    127         eoreqs      r0, r0, lr
    128         ldreq       r0, [r3], #4
    129         ldreq       lr, [r1, #4]!
    130         eoreqs      r0, r0, ip
    131         ldreq       r0, [r3], #4
    132         ldreq       ip, [r1, #4]!
    133         eoreqs      r0, r0, lr
    134         ldreq       r0, [r3], #4
    135         ldreq       lr, [r1, #4]!
    136         eoreqs      r0, r0, ip
    137         ldreq       r0, [r3], #4
    138         ldreq       ip, [r1, #4]!
    139         eoreqs      r0, r0, lr
    140         bne         2f                  /* some word differed: work out the result */
    141         subs        r2, r2, #16
    142         bhs         0b                  /* still 18 or more half-words left */
    143 
    144         /* do we have at least 2 half-words left? */
    145 1:      adds        r2, r2, #(16 - 2 + 2)       /* r2 = remaining half-words - 2 */
    146         bmi         4f
    147 
    148         /* finish off 2 half-words (one word) at a time */
    149 3:      ldr         r0, [r3], #4
    150         ldr         ip, [r1], #4
    151         eors        r0, r0, ip
    152         bne         2f
    153         subs        r2, r2, #2
    154         bhs         3b
    155 
    156         /* are we done? */
    157 4:      adds        r2, r2, #2          /* r2 = remaining half-words, 0 or 1 here */
    158         bne         8f
    159         /* restore registers and return */
    160         mov         r0, #0
    161         ldmfd       sp!, {r4, lr}
    162         bx          lr
    163 
    164 2:      /* the last 2 half-words compared (as one word) differ; redo them one half-word at a time to compute the result */
    165         ldrh        r0, [r3, #-4]       /* r3 and r1 are both 4 bytes past that word */
    166         ldrh        ip, [r1, #-4]
    167         subs        r0, r0, ip
    168         ldreqh      r0, [r3, #-2]       /* first pair equal: the second pair differs */
    169         ldreqh      ip, [r1, #-2]
    170         subeqs      r0, r0, ip
    171         /* restore registers and return */
    172         ldmfd       sp!, {r4, lr}
    173         bx          lr
    174 
    175         /* process the last few half-words, one per iteration */
    176 8:      ldrh        r0, [r3], #2
    177         ldrh        ip, [r1], #2
    178         subs        r0, r0, ip
    179         bne         9f
    180         subs        r2, r2, #1
    181         bne         8b
    182 
    183 9:      /* restore registers and return (r0 already holds the result) */
    184         ldmfd       sp!, {r4, lr}
    185         bx          lr
    186 
    187 
    188 5:      /*************** non-congruent case ***************/
    189 
                /* r3 is word aligned but r1 is not.  r1 is rounded down and read
                 * a whole word at a time; every word to compare is assembled
                 * from the upper half of the previously loaded word (lr, lsr #16)
                 * and the lower half of the newly loaded one (lr, lsl #16).
                 * NOTE(review): that merge matches memory order on a
                 * little-endian target only - confirm no big-endian build
                 * uses this file.
                 */
    190         /* align the unaligned pointer */
    191         bic         r1, r1, #3
    192         ldr         lr, [r1], #4
    193         sub         r2, r2, #8          /* r2 = remaining half-words - 8 */
    194 
    195 6:
    196         PLD         (r3, #64)
    197         PLD         (r1, #64)
    198         mov         ip, lr, lsr #16
    199         ldr         lr, [r1], #4
    200         ldr         r0, [r3], #4
    201         orr         ip, ip, lr, lsl #16
    202         eors        r0, r0, ip
    203         moveq       ip, lr, lsr #16
    204         ldreq       lr, [r1], #4
    205         ldreq       r0, [r3], #4
    206         orreq       ip, ip, lr, lsl #16
    207         eoreqs      r0, r0, ip
    208         moveq       ip, lr, lsr #16
    209         ldreq       lr, [r1], #4
    210         ldreq       r0, [r3], #4
    211         orreq       ip, ip, lr, lsl #16
    212         eoreqs      r0, r0, ip
    213         moveq       ip, lr, lsr #16
    214         ldreq       lr, [r1], #4
    215         ldreq       r0, [r3], #4
    216         orreq       ip, ip, lr, lsl #16
    217         eoreqs      r0, r0, ip
    218         bne         7f
    219         subs        r2, r2, #8
    220         bhs         6b                  /* still 8 or more half-words left */
    221         sub         r1, r1, #2          /* r1 ran one half-word ahead of the data compared */
    222         /* are we done? */
    223         adds        r2, r2, #8          /* r2 = remaining half-words (0..7) */
    224         moveq       r0, #0
    225         beq         9b
    226         /* finish off the remaining half-words */
    227         b           8b
    228 
    229 7:      /* fix up the 2 pointers and fallthrough... */
    230         sub         r1, r1, #2          /* so [r1, #-4] addresses the half-words just compared */
    231         b           2b
    232 END(__memcmp16)
    233