/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#ifdef __ARMEB__
#define S2LOMEM lsl
#define S2LOMEMEQ lsleq
#define S2HIMEM lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARMEB__ */
#define S2LOMEM lsr
#define S2LOMEMEQ lsreq
#define S2HIMEM lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARMEB__ */

.syntax         unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif

ENTRY(strcmp)
      /* Use LDRD whenever possible.  */

/* The main thing to look out for when comparing large blocks is that
   the loads do not cross a page boundary when loading past the index
   of the byte with the first difference or the first string-terminator.

   For example, if the strings are identical and the string-terminator
   is at index k, byte-by-byte comparison will not load beyond address
   s1+k and s2+k; word-by-word comparison may load up to 3 bytes beyond
   k; double-word comparison, up to 7 bytes.  If the load of these bytes
   crosses a page boundary, it might cause a memory fault (if the page
   is not mapped) that would not have happened in byte-by-byte
   comparison.

   If an address is (double) word aligned, then a load of a (double) word
   from that address will not cross a page boundary.
   Therefore, the algorithm below considers word and double-word alignment
   of strings separately.  */
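
/* Concretely: for a page size P that is a multiple of 8 (4 KiB on typical
   configurations), a 4-byte load from a 4-byte-aligned address a satisfies
   (a & ~(P-1)) == ((a + 3) & ~(P-1)), i.e. it cannot cross a page boundary;
   the same argument applies to an 8-byte load from an 8-byte-aligned
   address.  */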

/* High-level description of the algorithm.

   * The fast path: if both strings are double-word aligned,
     use LDRD to load two words from each string in every loop iteration.
   * If the strings have the same offset from a word boundary,
     use LDRB to load and compare byte by byte until
     the first string is aligned to a word boundary (at most 3 bytes).
     This is optimized for quick return on short unaligned strings.
   * If the strings have the same offset from a double-word boundary,
     use LDRD to load two words from each string in every loop iteration,
     as in the fast path.
   * If the strings do not have the same offset from a double-word boundary,
     load a word from the second string before the loop to initialize the queue.
     Use LDRD to load two words from each string in every loop iteration.
     Inside the loop, load the second word from the second string only after
     comparing the first word, using the queued value, to guarantee safety
     across page boundaries.
   * If the strings do not have the same offset from a word boundary,
     use LDR and a shift queue.  The order of loads and comparisons matters,
     as in the previous case.

   * Use UADD8 and SEL (or the 0x01010101/0x80808080 magic constants) to
     find zero bytes in a word, and use REV and CLZ to compute the return value.
   * The only difference between ARM and Thumb modes is the use of the CBZ
     instruction.
   * The only difference between big and little endian is the use of REV in
     little endian to compute the return value, instead of MOV.
*/
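
/* As a reader's reference, the word-at-a-time idea corresponds roughly to
   the following C sketch (illustrative only, not part of the build;
   resolve_bytewise is a hypothetical helper standing in for the
   return-value code below, and unsigned int is assumed 32-bit):

       static unsigned int has_zero(unsigned int w)
       {
           return (w - 0x01010101u) & ~w & 0x80808080u;
       }

       int strcmp_words(const unsigned int *s1, const unsigned int *s2)
       {
           for (;;) {
               unsigned int w1 = *s1++, w2 = *s2++;
               if (w1 != w2 || has_zero(w1))
                   return resolve_bytewise(w1, w2);
           }
       }
*/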

        .macro m_cbz reg label
#ifdef __thumb2__
        cbz     \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        beq     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbz */

        .macro m_cbnz reg label
#ifdef __thumb2__
        cbnz    \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        bne     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbnz */

        .macro  init
        /* Macro to save temporary registers and prepare magic values.  */
        subs    sp, sp, #16
        strd    r4, r5, [sp, #8]
        strd    r6, r7, [sp]
        mvn     r6, #0  /* all F */
        mov     r7, #0  /* all 0 */
        .endm   /* init */

        .macro  magic_compare_and_branch w1 w2 label
        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
        magic_find_zero_bytes \w1
        it      eq
        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
        bne     \label
        .endm /* magic_compare_and_branch */

        .macro  magic_find_zero_bytes w1
        /* Macro to find all-zero bytes in w1; the result is in ip.  */
#if (defined (__ARM_FEATURE_DSP))
        uadd8   ip, \w1, r6
        sel     ip, r7, r6
#else /* not defined (__ARM_FEATURE_DSP) */
        /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
        Coincidentally, these processors only have Thumb-2 mode, where the
        (large) magic constant is available directly as an immediate in
        instructions.  Note that we cannot use the magic constant in ARM
        mode, where we would need to create the constant in a register.  */
        sub     ip, \w1, #0x01010101
        bic     ip, ip, \w1
        and     ip, ip, #0x80808080
#endif /* not defined (__ARM_FEATURE_DSP) */
        .endm /* magic_find_zero_bytes */
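
        /* Both paths compute the same predicate.  In C terms (illustrative
           only), the non-DSP path is the classic zero-byte test
               zmask = (w - 0x01010101u) & ~w & 0x80808080u;
           which is nonzero exactly when some byte of w is 0x00.  The DSP
           path adds 0xff to each byte of w with UADD8, so a byte lane
           carries out (setting its GE flag) unless that byte is 0x00; SEL
           then writes 0xff into the zero lanes and 0x00 elsewhere.  Borrow
           propagation can also flag lanes above the first zero byte, but
           callers only test the mask for zero or use its lowest flagged
           lane, so that is harmless here.  */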

        .macro  setup_return w1 w2
#ifdef __ARMEB__
        mov     r1, \w1
        mov     r2, \w2
#else /* not  __ARMEB__ */
        rev     r1, \w1
        rev     r2, \w2
#endif /* not  __ARMEB__ */
        .endm /* setup_return */

        pld [r0, #0]
        pld [r1, #0]

        /* Are both strings double-word aligned?  */
        orr     ip, r0, r1
        tst     ip, #7
        bne     do_align

        /* Fast path.  */
        init

doubleword_aligned:

        /* Get here when the strings to compare are double-word aligned.  */
        /* Compare two words in every iteration.  */
        .p2align        2
2:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string.  */
        ldrd    r2, r3, [r0], #8
        ldrd    r4, r5, [r1], #8

        magic_compare_and_branch w1=r2, w2=r4, label=return_24
        magic_compare_and_branch w1=r3, w2=r5, label=return_35
        b       2b

do_align:
        /* Is the first string word-aligned?  */
        ands    ip, r0, #3
        beq     word_aligned_r0

        /* Fast compare byte by byte until the first string is word-aligned.  */
        /* The offset of r0 from a word boundary is in ip.  Thus, the number
        of bytes to read until the next word boundary is 4-ip.  */
        bic     r0, r0, #3
        ldr     r2, [r0], #4
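        /* ip is 1, 2 or 3 here.  LSLS #31 moves bit 0 of ip into the sign
           bit and bit 1 into the carry flag: Z=1 means ip==2 (branch to
           byte2), C=1 with Z=0 means ip==3 (branch to byte3), and otherwise
           ip==1 (fall through to byte1).  Each byteN block below uses UXTB
           with ROR #BYTEn_OFFSET to extract byte n of the loaded word r2
           for comparison against the next byte of the second string.  */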
        lsls    ip, ip, #31
        beq     byte2
        bcs     byte3

byte1:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE1_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte2:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE2_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte3:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE3_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbnz  reg=r3, label=word_aligned_r0

fast_return:
        mov     r0, ip
        bx      lr

word_aligned_r0:
        init
        /* The first string is word-aligned.  */
        /* Is the second string word-aligned?  */
        ands    ip, r1, #3
        bne     strcmp_unaligned

word_aligned:
        /* The strings are word-aligned. */
        /* Is the first string double-word aligned?  */
        tst     r0, #4
        beq     doubleword_aligned_r0

        /* If r0 is not double-word aligned yet, align it by loading
        and comparing the next word from each string.  */
        ldr     r2, [r0], #4
        ldr     r4, [r1], #4
        magic_compare_and_branch w1=r2 w2=r4 label=return_24

doubleword_aligned_r0:
        /* Get here when r0 is double-word aligned.  */
        /* Is r1 double-word aligned?  */
        tst     r1, #4
        beq     doubleword_aligned

        /* Get here when the strings to compare are word-aligned,
        r0 is double-word aligned, but r1 is not double-word aligned.  */

        /* Initialize the queue.  */
        ldr     r5, [r1], #4

        /* Compare two words in every iteration.  */
        .p2align        2
3:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string and compare.  */
        ldrd    r2, r3, [r0], #8
        magic_compare_and_branch w1=r2 w2=r5 label=return_25
        ldrd    r4, r5, [r1], #8
        magic_compare_and_branch w1=r3 w2=r4 label=return_34
        b       3b

        .macro miscmp_word offsetlo offsethi
        /* Macro to compare misaligned strings.  */
        /* r0, r1 are word-aligned, and at least one of the strings
        is not double-word aligned.  */
        /* Compare one word in every loop iteration.  */
        /* OFFSETLO is the original bit-offset of r1 from a word boundary;
        OFFSETHI is 32 - OFFSETLO (i.e., the offset from the next word).  */

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4

        /* Compare one word from each string in every loop iteration.  */
        .p2align        2
7:
        ldr     r3, [r0], #4
        S2LOMEM r5, r5, #\offsetlo
        magic_find_zero_bytes w1=r3
        cmp     r7, ip, S2HIMEM #\offsetlo
        and     r2, r3, r6, S2LOMEM #\offsetlo
        it      eq
        cmpeq   r2, r5
        bne     return_25
        ldr     r5, [r1], #4
        cmp     ip, #0
        eor     r3, r2, r3
        S2HIMEM r2, r5, #\offsethi
        it      eq
        cmpeq   r3, r2
        bne     return_32
        b       7b
        .endm /* miscmp_word */
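
        /* In C terms (little-endian case, illustrative only): with "prev"
           the queued word and "next" the newly loaded word of the second
           string, each iteration effectively compares one aligned word w1
           of the first string against the reassembled word
               w2 = (prev >> OFFSETLO) | (next << OFFSETHI);
           in two pieces, and drops out to the return code on a mismatch or
           on a zero byte in w1.  */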

strcmp_unaligned:
        /* r0 is word-aligned, r1 is at offset ip from a word boundary.  */
        /* Align r1 to the (previous) word boundary.  */
        bic     r1, r1, #3

        /* Unaligned comparison word by word using LDRs.  */
        cmp     ip, #2
        beq     miscmp_word_16                    /* If ip == 2.  */
        bge     miscmp_word_24                    /* If ip == 3.  */
        miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8


return_32:
        setup_return w1=r3, w2=r2
        b       do_return
return_34:
        setup_return w1=r3, w2=r4
        b       do_return
return_25:
        setup_return w1=r2, w2=r5
        b       do_return
return_35:
        setup_return w1=r3, w2=r5
        b       do_return
return_24:
        setup_return w1=r2, w2=r4

do_return:

#ifdef __ARMEB__
        mov     r0, ip
#else /* not  __ARMEB__ */
        rev     r0, ip
#endif /* not  __ARMEB__ */

        /* Restore temporaries early, before computing the return value.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16

        /* Either r1 and r2 differ, or r1 contains a zero byte.  */
        /* r0 contains a mask of the all-zero bytes in r1.  */
        /* Using r0 and not ip here because CBZ requires a low register.  */
        m_cbz   reg=r0, label=compute_return_value
        clz     r0, r0
        /* r0 contains the number of bits to the left of the first all-zero byte in r1.  */
        rsb     r0, r0, #24
        /* Here, r0 contains the number of bits to the right of the first all-zero byte in r1.  */
        lsr     r1, r1, r0
        lsr     r2, r2, r0

compute_return_value:
        movs    r0, #1
        cmp     r1, r2
        /* The return value is computed as follows.
        If r1>r2 then (C==1 and Z==0), LS does not hold, and r0 is #1 at return.
        If r1<r2 then (C==0 and Z==0), and we execute SBC with carry_in=0,
        which means r0:=r0-r0-1, so r0 is #-1 at return.
        If r1=r2 then (C==1 and Z==1), and we execute SBC with carry_in=1,
        which means r0:=r0-r0, so r0 is #0 at return.
        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
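        /* Example: if the first differing byte is 'b' (0x62) in the first
           string versus 'a' (0x61) in the second, then after setup_return
           r1 > r2 as unsigned words, LS does not hold, and we return #1.  */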
        it      ls
        sbcls   r0, r0, r0
        bx      lr

    /* The code from the previous version of strcmp.S handles this
     * particular case (the second string is 2 bytes off word alignment)
     * faster than any current version.  In this very specific case, use
     * the previous version.  See bionic/libc/arch-arm/cortex-a15/bionic/strcmp.S
     * for the unedited version of this code.
     */
miscmp_word_16:
        wp1 .req r0
        wp2 .req r1
        b1  .req r2
        w1  .req r4
        w2  .req r5
        t1  .req ip
        @ r3 is scratch

        /* At this point, wp1 (r0) has already been word-aligned.  */
2:
        mov     b1, #1
        orr     b1, b1, b1, lsl #8
        orr     b1, b1, b1, lsl #16

        and     t1, wp2, #3
        bic     wp2, wp2, #3
        ldr     w1, [wp1], #4
        ldr     w2, [wp2], #4

        /* Critical inner loop: block with 2 bytes initial overlap.  */
        .p2align        2
2:
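        /* Each pass compares one aligned word of the first string (w1)
           against a window of the second string that straddles two words:
           the low half of w1 against the high half of the previous w2,
           then, if the zero-byte syndrome in r3 stays clear, the high half
           of w1 against the low half of the freshly loaded w2.  (The shift
           directions reverse for big-endian.)  */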
        S2HIMEM t1, w1, #16
        sub     r3, w1, b1
        S2LOMEM t1, t1, #16
        bic     r3, r3, w1
        cmp     t1, w2, S2LOMEM #16
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, S2HIMEM #16
        bne     6f
        ldr     w1, [wp1], #4
        b       2b

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
         * with the bytes 0x01 0x00.
         */
        tst     w1, #0xff000000
        it      ne
        tstne   w1, #0x00ff0000
        beq     7f
#else
        lsls    r3, r3, #16
        bne     7f
#endif
        ldrh    w2, [wp2]
        S2LOMEM t1, w1, #16
#ifdef __ARMEB__
        lsl     w2, w2, #16
#endif
        b       8f

6:
        S2HIMEM w2, w2, #16
        S2LOMEM t1, w1, #16
4:
        S2LOMEM w2, w2, #16
        b       8f

7:
        mov     r0, #0

        /* Restore registers and stack.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16

        bx      lr

8:
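        /* Resolve the result byte by byte: compare the low byte of w2
           (second string) with the low byte of t1 (first string); while
           they are equal and non-NUL, shift both words down one byte and
           retry, then return the byte difference.  */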
        and     r2, t1, #LSB
        and     r0, w2, #LSB
        cmp     r0, #1
        it      cs
        cmpcs   r0, r2
        itt     eq
        S2LOMEMEQ       t1, t1, #8
        S2LOMEMEQ       w2, w2, #8
        beq     8b
        sub     r0, r2, r0

        /* Restore registers and stack.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16

        bx      lr
END(strcmp)