Home | History | Annotate | Download | only in arm
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
     18 #define ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
     19 
     20 #include "asm_support_arm.S"
     21 
     22 /*
     23  * Optimized memcmp16() for ARM9.
     24  * This would not be optimal on XScale or ARM11, where more prefetching
     25  * and use of pld will be needed.
     26  * The 2 major optimizations here are
     27  * (1) The main loop compares 16 bytes at a time
     28  * (2) The loads are scheduled in a way they won't stall
     29  */
     30 
     31 ARM_ENTRY __memcmp16
     32         pld         [r0, #0]
     33         pld         [r1, #0]
     34 
     35         /* take of the case where length is nul or the buffers are the same */
     36         cmp         r0, r1
     37         cmpne       r2, #0
     38         moveq       r0, #0
     39         bxeq        lr
     40 
     41         /* since r0 hold the result, move the first source
     42          * pointer somewhere else
     43          */
     44 
     45         mov         r3, r0
     46 
     47          /* make sure we have at least 12 words, this simplify things below
     48           * and avoid some overhead for small blocks
     49           */
     50 
     51         cmp         r2, #12
     52         bpl         0f
     53 
     54         /* small blocks (less then 12 words) */
     55         pld         [r0, #32]
     56         pld         [r1, #32]
     57 
     58 1:      ldrh        r0, [r3], #2
     59         ldrh        ip, [r1], #2
     60         subs        r0, r0, ip
     61         bxne        lr
     62         subs        r2, r2, #1
     63         bne         1b
     64         bx          lr
     65 
     66 
     67         /* save registers */
     68 0:      push        {r4, lr}
     69         .cfi_def_cfa_offset 8
     70         .cfi_rel_offset r4, 0
     71         .cfi_rel_offset lr, 4
     72 
     73         /* align first pointer to word boundary */
     74         tst         r3, #2
     75         beq         0f
     76 
     77         ldrh        r0, [r3], #2
     78         ldrh        ip, [r1], #2
     79         sub         r2, r2, #1
     80         subs        r0, r0, ip
     81         /* restore registers and return */
     82         popne       {r4, lr}
     83         bxne        lr
     84 
     85 
     86 0:      /* here the first pointer is aligned, and we have at least 3 words
     87          * to process.
     88          */
     89 
     90         /* see if the pointers are congruent */
     91         eor         r0, r3, r1
     92         ands        r0, r0, #2
     93         bne         5f
     94 
     95         /* congruent case, 16 half-words per iteration
     96          * We need to make sure there are at least 16+2 words left
     97          * because we effectively read ahead one long word, and we could
     98          * read past the buffer (and segfault) if we're not careful.
     99          */
    100 
    101         ldr         ip, [r1]
    102         subs        r2, r2, #(16 + 2)
    103         bmi         1f
    104 
    105 0:
    106         pld         [r3, #64]
    107         pld         [r1, #64]
    108         ldr         r0, [r3], #4
    109         ldr         lr, [r1, #4]!
    110         eors        r0, r0, ip
    111         ldreq       r0, [r3], #4
    112         ldreq       ip, [r1, #4]!
    113         eorseq      r0, r0, lr
    114         ldreq       r0, [r3], #4
    115         ldreq       lr, [r1, #4]!
    116         eorseq      r0, r0, ip
    117         ldreq       r0, [r3], #4
    118         ldreq       ip, [r1, #4]!
    119         eorseq      r0, r0, lr
    120         ldreq       r0, [r3], #4
    121         ldreq       lr, [r1, #4]!
    122         eorseq      r0, r0, ip
    123         ldreq       r0, [r3], #4
    124         ldreq       ip, [r1, #4]!
    125         eorseq      r0, r0, lr
    126         ldreq       r0, [r3], #4
    127         ldreq       lr, [r1, #4]!
    128         eorseq      r0, r0, ip
    129         ldreq       r0, [r3], #4
    130         ldreq       ip, [r1, #4]!
    131         eorseq      r0, r0, lr
    132         bne         2f
    133         subs        r2, r2, #16
    134         bhs         0b
    135 
    136         /* do we have at least 2 words left? */
    137 1:      adds        r2, r2, #(16 - 2 + 2)
    138         bmi         4f
    139 
    140         /* finish off 2 words at a time */
    141 3:      ldr         r0, [r3], #4
    142         ldr         ip, [r1], #4
    143         eors        r0, r0, ip
    144         bne         2f
    145         subs        r2, r2, #2
    146         bhs         3b
    147 
    148         /* are we done? */
    149 4:      adds        r2, r2, #2
    150         bne         8f
    151         /* restore registers and return */
    152         mov         r0, #0
    153         pop         {r4, lr}
    154         .cfi_restore r4
    155         .cfi_restore lr
    156         .cfi_adjust_cfa_offset -8
    157         bx          lr
    158 
    159 2:      /* the last 2 words are different, restart them */
    160         ldrh        r0, [r3, #-4]
    161         ldrh        ip, [r1, #-4]
    162         subs        r0, r0, ip
    163         ldrheq      r0, [r3, #-2]
    164         ldrheq      ip, [r1, #-2]
    165         subseq      r0, r0, ip
    166         /* restore registers and return */
    167         pop         {r4, lr}
    168         .cfi_restore r4
    169         .cfi_restore lr
    170         .cfi_adjust_cfa_offset -8
    171         bx          lr
    172 
    173         /* process the last few words */
    174 8:      ldrh        r0, [r3], #2
    175         ldrh        ip, [r1], #2
    176         subs        r0, r0, ip
    177         bne         9f
    178         subs        r2, r2, #1
    179         bne         8b
    180 
    181 9:      /* restore registers and return */
    182         pop         {r4, lr}
    183         .cfi_restore r4
    184         .cfi_restore lr
    185         .cfi_adjust_cfa_offset -8
    186         bx          lr
    187 
    188 
    189 5:      /*************** non-congruent case ***************/
    190 
    191         /* align the unaligned pointer */
    192         bic         r1, r1, #3
    193         ldr         lr, [r1], #4
    194         sub         r2, r2, #8
    195 
    196 6:
    197         pld         [r3, #64]
    198         pld         [r1, #64]
    199         mov         ip, lr, lsr #16
    200         ldr         lr, [r1], #4
    201         ldr         r0, [r3], #4
    202         orr         ip, ip, lr, lsl #16
    203         eors        r0, r0, ip
    204         moveq       ip, lr, lsr #16
    205         ldreq       lr, [r1], #4
    206         ldreq       r0, [r3], #4
    207         orreq       ip, ip, lr, lsl #16
    208         eorseq      r0, r0, ip
    209         moveq       ip, lr, lsr #16
    210         ldreq       lr, [r1], #4
    211         ldreq       r0, [r3], #4
    212         orreq       ip, ip, lr, lsl #16
    213         eorseq      r0, r0, ip
    214         moveq       ip, lr, lsr #16
    215         ldreq       lr, [r1], #4
    216         ldreq       r0, [r3], #4
    217         orreq       ip, ip, lr, lsl #16
    218         eorseq      r0, r0, ip
    219         bne         7f
    220         subs        r2, r2, #8
    221         bhs         6b
    222         sub         r1, r1, #2
    223         /* are we done? */
    224         adds        r2, r2, #8
    225         moveq       r0, #0
    226         beq         9b
    227         /* finish off the remaining bytes */
    228         b           8b
    229 
    230 7:      /* fix up the 2 pointers and fallthrough... */
    231         sub         r1, r1, #2
    232         b           2b
    233 END __memcmp16
    234 
    235 
    236 #endif  // ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
    237