/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#ifdef __ARMEB__
#define S2LOMEM lsl
#define S2LOMEMEQ lsleq
#define S2HIMEM lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARMEB__ */
#define S2LOMEM lsr
#define S2LOMEMEQ lsreq
#define S2HIMEM lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARMEB__ */

.syntax         unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif

        // To avoid warnings about deprecated instructions, specify an
        // explicit arch. The generated code is exactly the same.
        .arch armv7-a

ENTRY(strcmp)
      /* Use LDRD whenever possible.  */

/* The main thing to look out for when comparing large blocks is that
   the loads do not cross a page boundary when loading past the index
   of the byte with the first difference or the first string-terminator.

   For example, if the strings are identical and the string-terminator
   is at index k, byte by byte comparison will not load beyond address
   s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
   k; a double-word comparison, up to 7 bytes.  If the load of these bytes
   crosses a page boundary, it might cause a memory fault (if the page is
   not mapped) that would not have happened in byte by byte comparison.

   If an address is (double) word aligned, then a load of a (double) word
   from that address will not cross a page boundary.
   Therefore, the algorithm below considers word and double-word alignment
   of strings separately.  */
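
/* Concrete illustration (assuming 4 KiB pages): if the terminator of s2
   lies at address 0x1fff, the last byte of a page, an unaligned multi-byte
   load covering that byte could also touch 0x2000 on the next, possibly
   unmapped, page; an aligned word or double-word load never crosses the
   boundary.  */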

/* High-level description of the algorithm.

   * The fast path: if both strings are double-word aligned,
     use LDRD to load two words from each string in every loop iteration.
   * If the strings have the same offset from a word boundary,
     use LDRB to load and compare byte by byte until
     the first string is aligned to a word boundary (at most 3 bytes).
     This is optimized for quick return on short unaligned strings.
   * If the strings have the same offset from a double-word boundary,
     use LDRD to load two words from each string in every loop iteration,
     as in the fast path.
   * If the strings do not have the same offset from a double-word boundary,
     load a word from the second string before the loop to initialize the queue.
     Use LDRD to load two words from each string in every loop iteration.
     Inside the loop, load the second word from the second string only after
     comparing the first word, using the queued value, to guarantee safety
     across page boundaries.
   * If the strings do not have the same offset from a word boundary,
     use LDR and a shift queue. The order of loads and comparisons matters,
     similarly to the previous case.

   * Use UADD8 and SEL to detect zero bytes within a word, and use REV and CLZ
     to compute the return value.
   * The only difference between ARM and Thumb modes is the use of the CBZ
     instruction.
   * The only difference between big and little endian is the use of REV in
     little endian to compute the return value, instead of MOV.
*/
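
/* For reference, a minimal C sketch (not part of the build) of the
   byte-by-byte semantics every path below must preserve:

       int strcmp(const char *s1, const char *s2) {
           while (*s1 != '\0' && *s1 == *s2) {
               s1++;
               s2++;
           }
           return *(const unsigned char *)s1 - *(const unsigned char *)s2;
       }
*/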

        .macro m_cbz reg label
#ifdef __thumb2__
        cbz     \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        beq     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbz */

        .macro m_cbnz reg label
#ifdef __thumb2__
        cbnz    \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        bne     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbnz */
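
        /* CBZ/CBNZ exist only in Thumb state and can only branch a short
           distance forward, hence the wrappers above: in ARM state they
           fall back to an explicit CMP and conditional branch.  */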

        .macro  init
        /* Macro to save temporary registers and prepare magic values.  */
        subs    sp, sp, #16
        .cfi_def_cfa_offset 16
        strd    r4, r5, [sp, #8]
        .cfi_rel_offset r4, 8
        .cfi_rel_offset r5, 12
        strd    r6, r7, [sp]
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4
        mvn     r6, #0  /* all F */
        mov     r7, #0  /* all 0 */
        .endm   /* init */

        .macro  magic_compare_and_branch w1 w2 label
        /* Macro to compare registers w1 and w2 and conditionally branch to label.
           UADD8 in magic_find_zero_bytes sets only the APSR.GE flags, so the
           N/Z/C/V result of the CMP below survives until the CMPEQ.  */
        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
        magic_find_zero_bytes \w1
        it      eq
        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
        bne     \label          /* Taken if the words differ or w1 contains a NUL.  */
        .endm /* magic_compare_and_branch */

        .macro  magic_find_zero_bytes w1
        /* Macro to find all-zero bytes in w1, result is in ip.
           UADD8 adds 0xff to each byte of w1 and sets GE[i] iff byte i
           produces a carry, i.e. iff it is nonzero; SEL then picks 0x00
           (r7) for nonzero bytes and 0xff (r6) for zero bytes.  */
        uadd8   ip, \w1, r6
        sel     ip, r7, r6
        .endm /* magic_find_zero_bytes */
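
        /* Worked example: for \w1 = 0x1200ab34 the byte sums carry out for
           0x34, 0xab and 0x12 but not for 0x00, so GE = 1,1,0,1 and SEL
           yields ip = 0x00ff0000; ip is zero iff w1 holds no NUL byte.  */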

        .macro  setup_return w1 w2
#ifdef __ARMEB__
        mov     r1, \w1
        mov     r2, \w2
#else /* not  __ARMEB__ */
        rev     r1, \w1
        rev     r2, \w2
#endif /* not  __ARMEB__ */
        .endm /* setup_return */

        pld [r0, #0]
        pld [r1, #0]

        /* Are both strings double-word aligned?  */
        orr     ip, r0, r1
        tst     ip, #7
        bne     .L_do_align

        /* Fast path.  */
        init

.L_doubleword_aligned:

        /* Get here when the strings to compare are double-word aligned.  */
        /* Compare two words in every iteration.  */
        .p2align        2
2:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string.  */
        ldrd    r2, r3, [r0], #8
        ldrd    r4, r5, [r1], #8

        magic_compare_and_branch w1=r2, w2=r4, label=.L_return_24
        magic_compare_and_branch w1=r3, w2=r5, label=.L_return_35
        b       2b
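
        /* Each fast-path iteration consumes 8 bytes of each string, with the
           PLDs above prefetching two iterations ahead of the loads.  */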

.L_do_align:
        /* Is the first string word-aligned?  */
        ands    ip, r0, #3
        beq     .L_word_aligned_r0

        /* Fast compare byte by byte until the first string is word-aligned.  */
        /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
        to read until the next word boundary is 4-ip.  */
        bic     r0, r0, #3
        ldr     r2, [r0], #4
        /* After LSLS #31, Z holds the complement of bit 0 of the original ip
           and C holds bit 1, dispatching on ip = 1, 2 or 3 with one shift.  */
        lsls    ip, ip, #31
        beq     .L_byte2        /* ip was 2.  */
        bcs     .L_byte3        /* ip was 3.  */

.L_byte1:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE1_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        m_cbz   reg=r3, label=.L_fast_return

.L_byte2:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE2_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        m_cbz   reg=r3, label=.L_fast_return

.L_byte3:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE3_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        /* r0 is word-aligned now; continue there unless this byte was a NUL.  */
        m_cbnz  reg=r3, label=.L_word_aligned_r0

.L_fast_return:
        mov     r0, ip
        bx      lr

.L_word_aligned_r0:
        init
        /* The first string is word-aligned.  */
        /* Is the second string word-aligned?  */
        ands    ip, r1, #3
        bne     .L_strcmp_unaligned

.L_word_aligned:
        /* The strings are word-aligned. */
        /* Is the first string double-word aligned?  */
        tst     r0, #4
        beq     .L_doubleword_aligned_r0

        /* If r0 is not double-word aligned yet, align it by loading
        and comparing the next word from each string.  */
        ldr     r2, [r0], #4
        ldr     r4, [r1], #4
        magic_compare_and_branch w1=r2 w2=r4 label=.L_return_24

.L_doubleword_aligned_r0:
        /* Get here when r0 is double-word aligned.  */
        /* Is r1 double-word aligned?  */
        tst     r1, #4
        beq     .L_doubleword_aligned

        /* Get here when the strings to compare are word-aligned,
        r0 is double-word aligned, but r1 is not double-word aligned.  */

        /* Initialize the queue.  */
        ldr     r5, [r1], #4

        /* Compare two words in every iteration.  */
        .p2align        2
3:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string and compare.  */
        ldrd    r2, r3, [r0], #8
        magic_compare_and_branch w1=r2 w2=r5 label=.L_return_25
        ldrd    r4, r5, [r1], #8
        magic_compare_and_branch w1=r3 w2=r4 label=.L_return_34
        b       3b
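
        /* The LDRD from r1 is issued only after the queued word r5 has been
           compared: if the first word held the difference or the terminator,
           no bytes of the second string beyond it are ever loaded.  */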

        .macro miscmp_word offsetlo offsethi
        /* Macro to compare misaligned strings.  */
        /* r0, r1 are word-aligned, and at least one of the strings
        is not double-word aligned.  */
        /* Compare one word in every loop iteration.  */
        /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
        OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4

        /* Compare one word from each string in every loop iteration.  */
        .p2align        2
7:
        ldr     r3, [r0], #4
        S2LOMEM r5, r5, #\offsetlo      /* Shift out the s2 bytes already consumed.  */
        magic_find_zero_bytes w1=r3
        cmp     r7, ip, S2HIMEM #\offsetlo      /* Z=1 iff no NUL in the first OFFSETHI bits of r3.  */
        and     r2, r3, r6, S2LOMEM #\offsetlo  /* r2 = those first bits of r3.  */
        it      eq
        cmpeq   r2, r5
        bne     .L_return_25
        ldr     r5, [r1], #4            /* Refill the queue from s2.  */
        cmp     ip, #0                  /* Z=1 iff no NUL anywhere in r3.  */
        eor     r3, r2, r3              /* Keep the remaining OFFSETLO bits of r3.  */
        S2HIMEM r2, r5, #\offsethi      /* Align the first bits of the new s2 word with them.  */
        it      eq
        cmpeq   r3, r2
        bne     .L_return_32
        b       7b
        .endm /* miscmp_word */
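
        /* Illustration for offsetlo=8 on little endian: the queued r5 supplies
           three not-yet-compared s2 bytes, matched against the first three
           bytes of the s1 word; the freshly loaded r5 then supplies its first
           byte, shifted up by 24, to match the last byte of the s1 word.  */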

.L_strcmp_unaligned:
        /* r0 is word-aligned, r1 is at offset ip from a word.  */
        /* Align r1 to the (previous) word-boundary.  */
        bic     r1, r1, #3

        /* Unaligned comparison word by word using LDRs. */
        cmp     ip, #2
        beq     .L_miscmp_word_16                 /* If ip == 2.  */
        bge     .L_miscmp_word_24                 /* If ip == 3.  */
        miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
.L_miscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
.L_miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
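
        /* Each miscmp_word expansion loops until it branches to a return
           label, so control never falls through from one expansion into
           the next.  */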


.L_return_32:
        setup_return w1=r3, w2=r2
        b       .L_do_return
.L_return_34:
        setup_return w1=r3, w2=r4
        b       .L_do_return
.L_return_25:
        setup_return w1=r2, w2=r5
        b       .L_do_return
.L_return_35:
        setup_return w1=r3, w2=r5
        b       .L_do_return
.L_return_24:
        setup_return w1=r2, w2=r4

.L_do_return:

#ifdef __ARMEB__
        mov     r0, ip
#else /* not  __ARMEB__ */
        rev     r0, ip
#endif /* not  __ARMEB__ */

        /* Restore temporaries early, before computing the return value.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16
        .cfi_def_cfa_offset 0
        .cfi_restore r4
        .cfi_restore r5
        .cfi_restore r6
        .cfi_restore r7

        /* r1 and r2 hold a word from each string: either they differ, or r1
           contains a zero byte.  */
        /* r0 contains a mask of all-zero bytes in r1.  */
        /* Using r0 and not ip here because cbz requires a low register.  */
        m_cbz   reg=r0, label=.L_compute_return_value
        clz     r0, r0
        /* r0 contains the number of bits on the left of the first all-zero byte in r1.  */
        rsb     r0, r0, #24
        /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1.  */
        lsr     r1, r1, r0
        lsr     r2, r2, r0
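
        /* Example: a NUL in the second character gives (after REV) a mask of
           0x00ff0000, so CLZ = 8 and the shift by 24-8 = 16 keeps exactly the
           first two characters of each word for the final comparison.  */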

.L_compute_return_value:
        movs    r0, #1
        cmp     r1, r2
        /* The return value is computed as follows.
        If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
        If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
        which means r0:=r0-r0-1 and r0 is #-1 at return.
        If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
        which means r0:=r0-r0 and r0 is #0 at return.
        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
        it      ls
        sbcls   r0, r0, r0
        bx      lr
END(strcmp)