Home | History | Annotate | Download | only in bionic
      1 /*
      2  * Copyright (c) 2013 ARM Ltd
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. The name of the company may not be used to endorse or promote
     14  *    products derived from this software without specific prior written
     15  *    permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
     18  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     22  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <machine/cpu-features.h>
     30 #include <machine/asm.h>
     31 
     32 #ifdef __ARMEB__
     33 #define S2LOMEM lsl
     34 #define S2LOMEMEQ lsleq
     35 #define S2HIMEM lsr
     36 #define MSB 0x000000ff
     37 #define LSB 0xff000000
     38 #define BYTE0_OFFSET 24
     39 #define BYTE1_OFFSET 16
     40 #define BYTE2_OFFSET 8
     41 #define BYTE3_OFFSET 0
     42 #else /* not  __ARMEB__ */
     43 #define S2LOMEM lsr
     44 #define S2LOMEMEQ lsreq
     45 #define S2HIMEM lsl
     46 #define BYTE0_OFFSET 0
     47 #define BYTE1_OFFSET 8
     48 #define BYTE2_OFFSET 16
     49 #define BYTE3_OFFSET 24
     50 #define MSB 0xff000000
     51 #define LSB 0x000000ff
     52 #endif /* not  __ARMEB__ */
     53 
     54 .syntax         unified
     55 
     56 #if defined (__thumb__)
     57         .thumb
     58         .thumb_func
     59 #endif
     60 
     61 ENTRY(strcmp)
     62       /* Use LDRD whenever possible.  */
     63 
     64 /* The main thing to look out for when comparing large blocks is that
     65    the loads do not cross a page boundary when loading past the index
     66    of the byte with the first difference or the first string-terminator.
     67 
     68    For example, if the strings are identical and the string-terminator
     69    is at index k, byte by byte comparison will not load beyond address
     70    s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
     71    k; double word - up to 7 bytes.  If the load of these bytes crosses
     72    a page boundary, it might cause a memory fault (if the page is not mapped)
     73    that would not have happened in byte by byte comparison.
     74 
     75    If an address is (double) word aligned, then a load of a (double) word
     76    from that address will not cross a page boundary.
     77    Therefore, the algorithm below considers word and double-word alignment
     78    of strings separately.  */
     79 
     80 /* High-level description of the algorithm.
     81 
     82    * The fast path: if both strings are double-word aligned,
     83      use LDRD to load two words from each string in every loop iteration.
     84    * If the strings have the same offset from a word boundary,
     85      use LDRB to load and compare byte by byte until
     86      the first string is aligned to a word boundary (at most 3 bytes).
     87      This is optimized for quick return on short unaligned strings.
     88    * If the strings have the same offset from a double-word boundary,
     89      use LDRD to load two words from each string in every loop iteration, as in the fast path.
     90    * If the strings do not have the same offset from a double-word boundary,
     91      load a word from the second string before the loop to initialize the queue.
     92      Use LDRD to load two words from every string in every loop iteration.
     93      Inside the loop, load the second word from the second string only after comparing
     94      the first word, using the queued value, to guarantee safety across page boundaries.
     95    * If the strings do not have the same offset from a word boundary,
     96      use LDR and a shift queue. Order of loads and comparisons matters,
     97      similarly to the previous case.
     98 
     99    * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value.
    100    * The only difference between ARM and Thumb modes is the use of CBZ instruction.
    101    * The only difference between big and little endian is the use of REV in little endian
    102      to compute the return value, instead of MOV.
    103 */
    104 
    105         .macro m_cbz reg label
    106 #ifdef __thumb2__
    107         cbz     \reg, \label
    108 #else   /* not defined __thumb2__ */
    109         cmp     \reg, #0
    110         beq     \label
    111 #endif /* not defined __thumb2__ */
    112         .endm /* m_cbz */
    113 
    114         .macro m_cbnz reg label
    115 #ifdef __thumb2__
    116         cbnz    \reg, \label
    117 #else   /* not defined __thumb2__ */
    118         cmp     \reg, #0
    119         bne     \label
    120 #endif /* not defined __thumb2__ */
    121         .endm /* m_cbnz */
    122 
    123         .macro  init
    124         /* Macro to save temporary registers and prepare magic values.  */
    125         subs    sp, sp, #16
    126         strd    r4, r5, [sp, #8]
    127         strd    r6, r7, [sp]
    128         mvn     r6, #0  /* all F */
    129         mov     r7, #0  /* all 0 */
    130         .endm   /* init */
    131 
    132         .macro  magic_compare_and_branch w1 w2 label
    133         /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
    134         cmp     \w1, \w2        /* Are w1 and w2 the same?  */
    135         magic_find_zero_bytes \w1
    136         it      eq
    137         cmpeq   ip, #0          /* Is there a zero byte in w1?  */
    138         bne     \label
    139         .endm /* magic_compare_and_branch */
    140 
    141         .macro  magic_find_zero_bytes w1
    142         /* Macro to find all-zero bytes in w1, result is in ip.  */
    143 #if (defined (__ARM_FEATURE_DSP))
    144         uadd8   ip, \w1, r6
    145         sel     ip, r7, r6
    146 #else /* not defined (__ARM_FEATURE_DSP) */
    147         /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
    148         Coincidently, these processors only have Thumb-2 mode, where we can use the
    149         the (large) magic constant available directly as an immediate in instructions.
    150         Note that we cannot use the magic constant in ARM mode, where we need
    151         to create the constant in a register.  */
    152         sub     ip, \w1, #0x01010101
    153         bic     ip, ip, \w1
    154         and     ip, ip, #0x80808080
    155 #endif /* not defined (__ARM_FEATURE_DSP) */
    156         .endm /* magic_find_zero_bytes */
    157 
    158         .macro  setup_return w1 w2
    159 #ifdef __ARMEB__
    160         mov     r1, \w1
    161         mov     r2, \w2
    162 #else /* not  __ARMEB__ */
    163         rev     r1, \w1
    164         rev     r2, \w2
    165 #endif /* not  __ARMEB__ */
    166         .endm /* setup_return */
    167 
    168         pld [r0, #0]
    169         pld [r1, #0]
    170 
    171         /* Are both strings double-word aligned?  */
    172         orr     ip, r0, r1
    173         tst     ip, #7
    174         bne     do_align
    175 
    176         /* Fast path.  */
    177         init
    178 
    179 doubleword_aligned:
    180 
    181         /* Get here when the strings to compare are double-word aligned.  */
    182         /* Compare two words in every iteration.  */
    183         .p2align        2
    184 2:
    185         pld [r0, #16]
    186         pld [r1, #16]
    187 
    188         /* Load the next double-word from each string.  */
    189         ldrd    r2, r3, [r0], #8
    190         ldrd    r4, r5, [r1], #8
    191 
    192         magic_compare_and_branch w1=r2, w2=r4, label=return_24
    193         magic_compare_and_branch w1=r3, w2=r5, label=return_35
    194         b       2b
    195 
    196 do_align:
    197         /* Is the first string word-aligned?  */
    198         ands    ip, r0, #3
    199         beq     word_aligned_r0
    200 
    201         /* Fast compare byte by byte until the first string is word-aligned.  */
    202         /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
    203         to read until the next word boundary is 4-ip.  */
    204         bic     r0, r0, #3
    205         ldr     r2, [r0], #4
    206         lsls    ip, ip, #31
    207         beq     byte2
    208         bcs     byte3
    209 
    210 byte1:
    211         ldrb    ip, [r1], #1
    212         uxtb    r3, r2, ror #BYTE1_OFFSET
    213         subs    ip, r3, ip
    214         bne     fast_return
    215         m_cbz   reg=r3, label=fast_return
    216 
    217 byte2:
    218         ldrb    ip, [r1], #1
    219         uxtb    r3, r2, ror #BYTE2_OFFSET
    220         subs    ip, r3, ip
    221         bne     fast_return
    222         m_cbz   reg=r3, label=fast_return
    223 
    224 byte3:
    225         ldrb    ip, [r1], #1
    226         uxtb    r3, r2, ror #BYTE3_OFFSET
    227         subs    ip, r3, ip
    228         bne     fast_return
    229         m_cbnz  reg=r3, label=word_aligned_r0
    230 
    231 fast_return:
    232         mov     r0, ip
    233         bx      lr
    234 
    235 word_aligned_r0:
    236         init
    237         /* The first string is word-aligned.  */
    238         /* Is the second string word-aligned?  */
    239         ands    ip, r1, #3
    240         bne     strcmp_unaligned
    241 
    242 word_aligned:
    243         /* The strings are word-aligned. */
    244         /* Is the first string double-word aligned?  */
    245         tst     r0, #4
    246         beq     doubleword_aligned_r0
    247 
    248         /* If r0 is not double-word aligned yet, align it by loading
    249         and comparing the next word from each string.  */
    250         ldr     r2, [r0], #4
    251         ldr     r4, [r1], #4
    252         magic_compare_and_branch w1=r2 w2=r4 label=return_24
    253 
    254 doubleword_aligned_r0:
    255         /* Get here when r0 is double-word aligned.  */
    256         /* Is r1 doubleword_aligned?  */
    257         tst     r1, #4
    258         beq     doubleword_aligned
    259 
    260         /* Get here when the strings to compare are word-aligned,
    261         r0 is double-word aligned, but r1 is not double-word aligned.  */
    262 
    263         /* Initialize the queue.  */
    264         ldr     r5, [r1], #4
    265 
    266         /* Compare two words in every iteration.  */
    267         .p2align        2
    268 3:
    269         pld [r0, #16]
    270         pld [r1, #16]
    271 
    272         /* Load the next double-word from each string and compare.  */
    273         ldrd    r2, r3, [r0], #8
    274         magic_compare_and_branch w1=r2 w2=r5 label=return_25
    275         ldrd    r4, r5, [r1], #8
    276         magic_compare_and_branch w1=r3 w2=r4 label=return_34
    277         b       3b
    278 
    279         .macro miscmp_word offsetlo offsethi
    280         /* Macro to compare misaligned strings.  */
    281         /* r0, r1 are word-aligned, and at least one of the strings
    282         is not double-word aligned.  */
    283         /* Compare one word in every loop iteration.  */
    284         /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
    285         OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */
    286 
    287         /* Initialize the shift queue.  */
    288         ldr     r5, [r1], #4
    289 
    290         /* Compare one word from each string in every loop iteration.  */
    291         .p2align        2
    292 7:
    293         ldr     r3, [r0], #4
    294         S2LOMEM r5, r5, #\offsetlo
    295         magic_find_zero_bytes w1=r3
    296         cmp     r7, ip, S2HIMEM #\offsetlo
    297         and     r2, r3, r6, S2LOMEM #\offsetlo
    298         it      eq
    299         cmpeq   r2, r5
    300         bne     return_25
    301         ldr     r5, [r1], #4
    302         cmp     ip, #0
    303         eor r3, r2, r3
    304         S2HIMEM r2, r5, #\offsethi
    305         it      eq
    306         cmpeq   r3, r2
    307         bne     return_32
    308         b       7b
    309         .endm /* miscmp_word */
    310 
    311 strcmp_unaligned:
    312         /* r0 is word-aligned, r1 is at offset ip from a word.  */
    313         /* Align r1 to the (previous) word-boundary.  */
    314         bic     r1, r1, #3
    315 
    316         /* Unaligned comparison word by word using LDRs. */
    317         cmp     ip, #2
    318         beq     miscmp_word_16                    /* If ip == 2.  */
    319         bge     miscmp_word_24                    /* If ip == 3.  */
    320         miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
    321 miscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
    322 miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
    323 
    324 
    325 return_32:
    326         setup_return w1=r3, w2=r2
    327         b       do_return
    328 return_34:
    329         setup_return w1=r3, w2=r4
    330         b       do_return
    331 return_25:
    332         setup_return w1=r2, w2=r5
    333         b       do_return
    334 return_35:
    335         setup_return w1=r3, w2=r5
    336         b       do_return
    337 return_24:
    338         setup_return w1=r2, w2=r4
    339 
    340 do_return:
    341 
    342 #ifdef __ARMEB__
    343         mov     r0, ip
    344 #else /* not  __ARMEB__ */
    345         rev     r0, ip
    346 #endif /* not  __ARMEB__ */
    347 
    348         /* Restore temporaries early, before computing the return value.  */
    349         ldrd    r6, r7, [sp]
    350         ldrd    r4, r5, [sp, #8]
    351         adds    sp, sp, #16
    352 
    353         /* There is a zero or a different byte between r1 and r2.  */
    354         /* r0 contains a mask of all-zero bytes in r1.  */
    355         /* Using r0 and not ip here because cbz requires low register.  */
    356         m_cbz   reg=r0, label=compute_return_value
    357         clz     r0, r0
    358         /* r0 contains the number of bits on the left of the first all-zero byte in r1.  */
    359         rsb     r0, r0, #24
    360         /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1.  */
    361         lsr     r1, r1, r0
    362         lsr     r2, r2, r0
    363 
    364 compute_return_value:
    365         movs    r0, #1
    366         cmp     r1, r2
    367         /* The return value is computed as follows.
    368         If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
    369         If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
    370         which means r0:=r0-r0-1 and r0 is #-1 at return.
    371         If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
    372         which means r0:=r0-r0 and r0 is #0 at return.
    373         (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
    374         it      ls
    375         sbcls   r0, r0, r0
    376         bx      lr
    377 END(strcmp)
    378