/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

     29 #include <machine/cpu-features.h>
     30 
     31     .text
     32 
     33     .global __memcmp16
     34     .type __memcmp16, %function
     35     .align 4
     36 
     37 /*
     38  * Optimized memcmp16() for ARM9.
     39  * This would not be optimal on XScale or ARM11, where more prefetching
     40  * and use of PLD will be needed.
     41  * The 2 major optimzations here are
     42  * (1) The main loop compares 16 bytes at a time
     43  * (2) The loads are scheduled in a way they won't stall
     44  */
     45 
     46 __memcmp16:
     47         .fnstart
     48         PLD         (r0, #0)
     49         PLD         (r1, #0)
     50 
     51         /* take of the case where length is nul or the buffers are the same */
     52         cmp         r0, r1
     53         cmpne       r2, #0
     54         moveq       r0, #0
     55         bxeq        lr
     56 
     57         /* since r0 hold the result, move the first source
     58          * pointer somewhere else
     59          */
     60 
     61         mov         r3, r0
     62 
     63          /* make sure we have at least 12 words, this simplify things below
     64           * and avoid some overhead for small blocks
     65           */
     66 
     67         cmp         r2, #12
     68         bpl         0f
     69 
     70         /* small blocks (less then 12 words) */
     71         PLD         (r0, #32)
     72         PLD         (r1, #32)
     73 
     74 1:      ldrh        r0, [r3], #2
     75         ldrh        ip, [r1], #2
     76         subs        r0, r0, ip
     77         bxne        lr
     78         subs        r2, r2, #1
     79         bne         1b
     80         bx          lr
     81 
     82 
     83         .save {r4, lr}
     84         /* save registers */
     85 0:      stmfd       sp!, {r4, lr}
     86 
     87         /* align first pointer to word boundary */
     88         tst         r3, #2
     89         beq         0f
     90 
     91         ldrh        r0, [r3], #2
     92         ldrh        ip, [r1], #2
     93         sub         r2, r2, #1
     94         subs        r0, r0, ip
     95         /* restore registers and return */
     96         ldmnefd     sp!, {r4, lr}
     97         bxne        lr
     98         .fnend
     99 
    100 
    101 
    102 0:      /* here the first pointer is aligned, and we have at least 3 words
    103          * to process.
    104          */
    105 
    106         /* see if the pointers are congruent */
    107         eor         r0, r3, r1
    108         ands        r0, r0, #2
    109         bne         5f
    110 
    111         /* congruent case, 16 half-words per iteration
    112          * We need to make sure there are at least 16+2 words left
    113          * because we effectively read ahead one long word, and we could
    114          * read past the buffer (and segfault) if we're not careful.
    115          */
    116 
    117         ldr         ip, [r1]
    118         subs        r2, r2, #(16 + 2)
    119         bmi         1f
    120 
    121 0:
    122         PLD         (r3, #64)
    123         PLD         (r1, #64)
    124         ldr         r0, [r3], #4
    125         ldr         lr, [r1, #4]!
    126         eors        r0, r0, ip
    127         ldreq       r0, [r3], #4
    128         ldreq       ip, [r1, #4]!
    129         eoreqs      r0, r0, lr
    130         ldreq       r0, [r3], #4
    131         ldreq       lr, [r1, #4]!
    132         eoreqs      r0, r0, ip
    133         ldreq       r0, [r3], #4
    134         ldreq       ip, [r1, #4]!
    135         eoreqs      r0, r0, lr
    136         ldreq       r0, [r3], #4
    137         ldreq       lr, [r1, #4]!
    138         eoreqs      r0, r0, ip
    139         ldreq       r0, [r3], #4
    140         ldreq       ip, [r1, #4]!
    141         eoreqs      r0, r0, lr
    142         ldreq       r0, [r3], #4
    143         ldreq       lr, [r1, #4]!
    144         eoreqs      r0, r0, ip
    145         ldreq       r0, [r3], #4
    146         ldreq       ip, [r1, #4]!
    147         eoreqs      r0, r0, lr
    148         bne         2f
    149         subs        r2, r2, #16
    150         bhs         0b
    151 
    152         /* do we have at least 2 words left? */
    153 1:      adds        r2, r2, #(16 - 2 + 2)
    154         bmi         4f
    155 
    156         /* finish off 2 words at a time */
    157 3:      ldr         r0, [r3], #4
    158         ldr         ip, [r1], #4
    159         eors        r0, r0, ip
    160         bne         2f
    161         subs        r2, r2, #2
    162         bhs         3b
    163 
    164         /* are we done? */
    165 4:      adds        r2, r2, #2
    166         bne         8f
    167         /* restore registers and return */
    168         mov         r0, #0
    169         ldmfd       sp!, {r4, lr}
    170         bx          lr
    171 
    172 2:      /* the last 2 words are different, restart them */
    173         ldrh        r0, [r3, #-4]
    174         ldrh        ip, [r1, #-4]
    175         subs        r0, r0, ip
    176         ldreqh      r0, [r3, #-2]
    177         ldreqh      ip, [r1, #-2]
    178         subeqs      r0, r0, ip
    179         /* restore registers and return */
    180         ldmfd       sp!, {r4, lr}
    181         bx          lr
    182 
    183         /* process the last few words */
    184 8:      ldrh        r0, [r3], #2
    185         ldrh        ip, [r1], #2
    186         subs        r0, r0, ip
    187         bne         9f
    188         subs        r2, r2, #1
    189         bne         8b
    190 
    191 9:      /* restore registers and return */
    192         ldmfd       sp!, {r4, lr}
    193         bx          lr
    194 
    195 
    196 5:      /*************** non-congruent case ***************/
    197 
    198         /* align the unaligned pointer */
    199         bic         r1, r1, #3
    200         ldr         lr, [r1], #4
    201         sub         r2, r2, #8
    202 
    203 6:
    204         PLD         (r3, #64)
    205         PLD         (r1, #64)
    206         mov         ip, lr, lsr #16
    207         ldr         lr, [r1], #4
    208         ldr         r0, [r3], #4
    209         orr         ip, ip, lr, lsl #16
    210         eors        r0, r0, ip
    211         moveq       ip, lr, lsr #16
    212         ldreq       lr, [r1], #4
    213         ldreq       r0, [r3], #4
    214         orreq       ip, ip, lr, lsl #16
    215         eoreqs      r0, r0, ip
    216         moveq       ip, lr, lsr #16
    217         ldreq       lr, [r1], #4
    218         ldreq       r0, [r3], #4
    219         orreq       ip, ip, lr, lsl #16
    220         eoreqs      r0, r0, ip
    221         moveq       ip, lr, lsr #16
    222         ldreq       lr, [r1], #4
    223         ldreq       r0, [r3], #4
    224         orreq       ip, ip, lr, lsl #16
    225         eoreqs      r0, r0, ip
    226         bne         7f
    227         subs        r2, r2, #8
    228         bhs         6b
    229         sub         r1, r1, #2
    230         /* are we done? */
    231         adds        r2, r2, #8
    232         moveq       r0, #0
    233         beq         9b
    234         /* finish off the remaining bytes */
    235         b           8b
    236 
    237 7:      /* fix up the 2 pointers and fallthrough... */
    238         sub         r1, r1, #2
    239         b           2b
    240