/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#ifdef __ARMEB__
#define S2LOMEM lsl
#define S2LOMEMEQ lsleq
#define S2HIMEM lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARMEB__ */
#define S2LOMEM lsr
#define S2LOMEMEQ lsreq
#define S2HIMEM lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARMEB__ */
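
/* S2LOMEM is the shift that moves bytes of a loaded word toward the
   register lane holding the lowest-addressed byte of the string
   (LSR on little-endian, LSL on big-endian); S2HIMEM is the opposite
   direction.  These macros let the comparison code below be written
   once for both byte orders. */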

.syntax         unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif

ENTRY(strcmp)
      /* Use LDRD whenever possible.  */

/* The main thing to look out for when comparing large blocks is that
   the loads do not cross a page boundary when loading past the index
   of the byte with the first difference or the first string-terminator.

   For example, if the strings are identical and the string-terminator
   is at index k, byte by byte comparison will not load beyond address
   s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
   k; double word comparison, up to 7 bytes.  If the load of these bytes
   crosses a page boundary, it might cause a memory fault (if the page is
   not mapped) that would not have happened in byte by byte comparison.

   If an address is (double) word aligned, then a load of a (double) word
   from that address will not cross a page boundary.
   Therefore, the algorithm below considers word and double-word alignment
   of strings separately.  */

/* High-level description of the algorithm.

   * The fast path: if both strings are double-word aligned,
     use LDRD to load two words from each string in every loop iteration.
   * If the strings have the same offset from a word boundary,
     use LDRB to load and compare byte by byte until
     the first string is aligned to a word boundary (at most 3 bytes).
     This is optimized for quick return on short unaligned strings.
   * If the strings have the same offset from a double-word boundary,
     use LDRD to load two words from each string in every loop iteration,
     as in the fast path.
   * If the strings do not have the same offset from a double-word boundary,
     load a word from the second string before the loop to initialize the queue.
     Use LDRD to load two words from each string in every loop iteration.
     Inside the loop, load the second word from the second string only after
     comparing the first word, using the queued value, to guarantee safety
     across page boundaries.
   * If the strings do not have the same offset from a word boundary,
     use LDR and a shift queue. The order of loads and comparisons matters,
     as in the previous case.

   * Use UADD8 and SEL to compare words, and use REV and CLZ to compute
     the return value.
   * The only difference between ARM and Thumb modes is the use of the
     CBZ instruction.
   * The only difference between big and little endian is the use of REV
     in little endian to compute the return value, instead of MOV.
*/
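
/* For reference, the function computes the usual C strcmp result; a
   minimal sketch of the semantics being implemented:

       int strcmp(const char *s1, const char *s2) {
           while (*s1 != '\0' && *s1 == *s2) {
               s1++;
               s2++;
           }
           return *(const unsigned char *)s1 - *(const unsigned char *)s2;
       }

   As with any strcmp, only the sign of the result is significant; the
   paths below return either a byte difference or -1/0/1. */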

        .macro m_cbz reg label
#ifdef __thumb2__
        cbz     \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        beq     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbz */

        .macro m_cbnz reg label
#ifdef __thumb2__
        cbnz    \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        bne     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbnz */

        .macro  init
        /* Macro to save temporary registers and prepare magic values.  */
        subs    sp, sp, #16
        strd    r4, r5, [sp, #8]
        strd    r6, r7, [sp]
        mvn     r6, #0  /* all F */
        mov     r7, #0  /* all 0 */
        .endm   /* init */

        .macro  magic_compare_and_branch w1 w2 label
        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
        magic_find_zero_bytes \w1
        it      eq
        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
        bne     \label
        .endm /* magic_compare_and_branch */

        .macro  magic_find_zero_bytes w1
        /* Macro to find all-zero bytes in w1, result is in ip.  */
#if (defined (__ARM_FEATURE_DSP))
        uadd8   ip, \w1, r6
        sel     ip, r7, r6
#else /* not defined (__ARM_FEATURE_DSP) */
        /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
        Coincidentally, these processors only have Thumb-2 mode, where we
        can use the (large) magic constant directly as an immediate in
        instructions.  Note that we cannot do this in ARM mode, where the
        constant must first be built in a register.  */
        sub     ip, \w1, #0x01010101
        bic     ip, ip, \w1
        and     ip, ip, #0x80808080
#endif /* not defined (__ARM_FEATURE_DSP) */
        .endm /* magic_find_zero_bytes */
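
/* Both variants of magic_find_zero_bytes answer the same question.  In C
   terms, the non-DSP sequence is the classic zero-byte test; a minimal
   sketch for 32-bit words:

       uint32_t mask = (w - 0x01010101u) & ~w & 0x80808080u;

   mask is nonzero exactly when some byte of w is 0x00.  The DSP variant
   exploits UADD8: adding 0xff (from r6) to a byte carries out, setting
   that byte's GE flag, for every byte value except 0x00.  SEL then picks
   0x00 (from r7) for bytes whose GE flag is set and 0xff (from r6) for
   the rest, so ip is nonzero iff w1 contains a zero byte. */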

        .macro  setup_return w1 w2
#ifdef __ARMEB__
        mov     r1, \w1
        mov     r2, \w2
#else /* not  __ARMEB__ */
        rev     r1, \w1
        rev     r2, \w2
#endif /* not  __ARMEB__ */
        .endm /* setup_return */
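
/* On a little-endian target the first (lowest-addressed) differing or
   NUL byte is the least significant byte of the word, so setup_return
   uses REV to move it to the most significant position; the unsigned
   word comparison in compute_return_value then orders the strings
   lexicographically.  On big-endian the bytes are already in that
   order, so a plain MOV suffices. */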

        pld [r0, #0]
        pld [r1, #0]

        /* Are both strings double-word aligned?  */
        orr     ip, r0, r1
        tst     ip, #7
        bne     do_align

        /* Fast path.  */
        init

doubleword_aligned:

        /* Get here when the strings to compare are double-word aligned.  */
        /* Compare two words in every iteration.  */
        .p2align        2
2:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string.  */
        ldrd    r2, r3, [r0], #8
        ldrd    r4, r5, [r1], #8

        magic_compare_and_branch w1=r2, w2=r4, label=return_24
        magic_compare_and_branch w1=r3, w2=r5, label=return_35
        b       2b

do_align:
        /* Is the first string word-aligned?  */
        ands    ip, r0, #3
        beq     word_aligned_r0

        /* Fast compare byte by byte until the first string is word-aligned.  */
        /* The offset of r0 from a word boundary is in ip. Thus, the number of
        bytes to read until the next word boundary is 4-ip.  */
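        /* ip is 1, 2, or 3 here.  The LSLS #31 below moves bit 0 of ip
        into bit 31 of the result and bit 1 into the carry flag, so
        ip == 2 sets Z (two bytes left in this word, branch to byte2),
        ip == 3 sets C with Z clear (one byte left, branch to byte3),
        and ip == 1 falls through to byte1 (three bytes left).  */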
        bic     r0, r0, #3
        ldr     r2, [r0], #4
        lsls    ip, ip, #31
        beq     byte2
        bcs     byte3

byte1:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE1_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte2:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE2_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte3:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE3_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbnz  reg=r3, label=word_aligned_r0

fast_return:
        mov     r0, ip
        bx      lr

word_aligned_r0:
        init
        /* The first string is word-aligned.  */
        /* Is the second string word-aligned?  */
        ands    ip, r1, #3
        bne     strcmp_unaligned

word_aligned:
        /* The strings are word-aligned. */
        /* Is the first string double-word aligned?  */
        tst     r0, #4
        beq     doubleword_aligned_r0

        /* If r0 is not double-word aligned yet, align it by loading
        and comparing the next word from each string.  */
        ldr     r2, [r0], #4
        ldr     r4, [r1], #4
        magic_compare_and_branch w1=r2 w2=r4 label=return_24

doubleword_aligned_r0:
        /* Get here when r0 is double-word aligned.  */
        /* Is r1 double-word aligned?  */
        tst     r1, #4
        beq     doubleword_aligned

        /* Get here when the strings to compare are word-aligned,
        r0 is double-word aligned, but r1 is not double-word aligned.  */
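        /* r5 acts as a one-word queue: it always holds a word of the
        second string loaded on an earlier iteration, so the next load
        from r1 happens only after the words already in hand are known
        to be equal and NUL-free.  This is what keeps the LDRD of the
        second string from reading past the terminator across a page
        boundary.  */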

        /* Initialize the queue.  */
        ldr     r5, [r1], #4

        /* Compare two words in every iteration.  */
        .p2align        2
3:
        pld [r0, #16]
        pld [r1, #16]

        /* Load the next double-word from each string and compare.  */
        ldrd    r2, r3, [r0], #8
        magic_compare_and_branch w1=r2 w2=r5 label=return_25
        ldrd    r4, r5, [r1], #8
        magic_compare_and_branch w1=r3 w2=r4 label=return_34
        b       3b

        .macro miscmp_word offsetlo offsethi
        /* Macro to compare misaligned strings.  */
        /* r0, r1 are word-aligned, and at least one of the strings
        is not double-word aligned.  */
        /* Compare one word in every loop iteration.  */
        /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
        OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4

        /* Compare one word from each string in every loop iteration.  */
        .p2align        2
7:
        ldr     r3, [r0], #4
        S2LOMEM r5, r5, #\offsetlo
        magic_find_zero_bytes w1=r3
        cmp     r7, ip, S2HIMEM #\offsetlo
        and     r2, r3, r6, S2LOMEM #\offsetlo
        it      eq
        cmpeq   r2, r5
        bne     return_25
        ldr     r5, [r1], #4
        cmp     ip, #0
        eor     r3, r2, r3
        S2HIMEM r2, r5, #\offsethi
        it      eq
        cmpeq   r3, r2
        bne     return_32
        b       7b
        .endm /* miscmp_word */

return_32:
        setup_return w1=r3, w2=r2
        b       do_return
return_34:
        setup_return w1=r3, w2=r4
        b       do_return
return_25:
        setup_return w1=r2, w2=r5
        b       do_return
return_35:
        setup_return w1=r3, w2=r5
        b       do_return
return_24:
        setup_return w1=r2, w2=r4

do_return:

#ifdef __ARMEB__
        mov     r0, ip
#else /* not  __ARMEB__ */
        rev     r0, ip
#endif /* not  __ARMEB__ */

        /* Restore temporaries early, before computing the return value.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16

        /* At this point r1 and r2 differ, or one of them contains a zero byte.  */
        /* r0 contains a mask of all-zero bytes in r1.  */
        /* Using r0 and not ip here because CBZ requires a low register.  */
        m_cbz   reg=r0, label=compute_return_value
        clz     r0, r0
        /* r0 contains the number of bits to the left of the first all-zero byte in r1.  */
        rsb     r0, r0, #24
        /* Now r0 contains the number of bits to the right of the first all-zero byte in r1.  */
        lsr     r1, r1, r0
        lsr     r2, r2, r0

compute_return_value:
        movs    r0, #1
        cmp     r1, r2
        /* The return value is computed as follows.
        If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
        If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
        which means r0:=r0-r0-1 and r0 is #-1 at return.
        If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
        which means r0:=r0-r0 and r0 is #0 at return.
        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
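        /* Worked example (illustrative): comparing "abc" and "abd"
        reaches this point with r1 < r2 as unsigned integers, so C==0,
        LS holds, and SBCLS computes r0 = 1 - 1 - 1 = -1, the expected
        negative result.  */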
        it      ls
        sbcls   r0, r0, r0
        bx      lr

    /* The code from the previous version of strcmp.S handles the cases
     * where the first and second strings cannot both be aligned to a
     * word boundary, and does so faster than the new algorithm. See
     * bionic/libc/arch-arm/cortex-a15/bionic/strcmp.S for the unedited
     * version of the code.
     */
strcmp_unaligned:
	wp1 .req r0
	wp2 .req r1
	b1  .req r2
	w1  .req r4
	w2  .req r5
	t1  .req ip
	@ r3 is scratch

2:
	mov	b1, #1
	orr	b1, b1, b1, lsl #8
	orr	b1, b1, b1, lsl #16
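	/* b1 is now 0x01010101: the 0x01 byte replicated into every lane,
	 * ready for the zero-byte test (w - b1) & ~w & (b1 << 7). */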

	and	t1, wp2, #3
	bic	wp2, wp2, #3
	ldr	w1, [wp1], #4
	ldr	w2, [wp2], #4
	cmp	t1, #2
	beq	2f
	bhi	3f
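	/* t1 holds the misalignment of wp2 and selects the loop:
	 * t1 == 1 falls through to the 3-byte-overlap block below,
	 * t1 == 2 branches to the 2-byte-overlap block at 2f, and
	 * t1 == 3 branches to the 1-byte-overlap block at 3f. */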

	/* Critical inner Loop: Block with 3 bytes initial overlap */
	.p2align	2
1:
	bic	t1, w1, #MSB
	cmp	t1, w2, S2LOMEM #8
	sub	r3, w1, b1
	bic	r3, r3, w1
	bne	4f
	ands	r3, r3, b1, lsl #7
	it	eq
	ldreq	w2, [wp2], #4
	bne	5f
	eor	t1, t1, w1
	cmp	t1, w2, S2HIMEM #24
	bne	6f
	ldr	w1, [wp1], #4
	b	1b
4:
	S2LOMEM	w2, w2, #8
	b	8f

5:
#ifdef __ARMEB__
	/* The syndrome value may contain false ones if the string ends
	 * with the bytes 0x01 0x00.
	 */
	tst	w1, #0xff000000
	itt	ne
	tstne	w1, #0x00ff0000
	tstne	w1, #0x0000ff00
	beq	7f
#else
	bics	r3, r3, #0xff000000
	bne	7f
#endif
	ldrb	w2, [wp2]
	S2LOMEM	t1, w1, #24
#ifdef __ARMEB__
	lsl	w2, w2, #24
#endif
	b	8f

6:
	S2LOMEM	t1, w1, #24
	and	w2, w2, #LSB
	b	8f

	/* Critical inner Loop: Block with 2 bytes initial overlap */
	.p2align	2
2:
	S2HIMEM	t1, w1, #16
	sub	r3, w1, b1
	S2LOMEM	t1, t1, #16
	bic	r3, r3, w1
	cmp	t1, w2, S2LOMEM #16
	bne	4f
	ands	r3, r3, b1, lsl #7
	it	eq
	ldreq	w2, [wp2], #4
	bne	5f
	eor	t1, t1, w1
	cmp	t1, w2, S2HIMEM #16
	bne	6f
	ldr	w1, [wp1], #4
	b	2b

5:
#ifdef __ARMEB__
	/* The syndrome value may contain false ones if the string ends
	 * with the bytes 0x01 0x00.
	 */
	tst	w1, #0xff000000
	it	ne
	tstne	w1, #0x00ff0000
	beq	7f
#else
	lsls	r3, r3, #16
	bne	7f
#endif
	ldrh	w2, [wp2]
	S2LOMEM	t1, w1, #16
#ifdef __ARMEB__
	lsl	w2, w2, #16
#endif
	b	8f

6:
	S2HIMEM	w2, w2, #16
	S2LOMEM	t1, w1, #16
4:
	S2LOMEM	w2, w2, #16
	b	8f

	/* Critical inner Loop: Block with 1 byte initial overlap */
	.p2align	2
3:
	and	t1, w1, #LSB
	cmp	t1, w2, S2LOMEM #24
	sub	r3, w1, b1
	bic	r3, r3, w1
	bne	4f
	ands	r3, r3, b1, lsl #7
	it	eq
	ldreq	w2, [wp2], #4
	bne	5f
	eor	t1, t1, w1
	cmp	t1, w2, S2HIMEM #8
	bne	6f
	ldr	w1, [wp1], #4
	b	3b
4:
	S2LOMEM	w2, w2, #24
	b	8f
5:
	/* The syndrome value may contain false ones if the string ends
	 * with the bytes 0x01 0x00.
	 */
	tst	w1, #LSB
	beq	7f
	ldr	w2, [wp2], #4
6:
	S2LOMEM	t1, w1, #8
	bic	w2, w2, #MSB
	b	8f
7:
	mov	r0, #0

	/* Restore registers and stack. */
	ldrd	r6, r7, [sp]
	ldrd	r4, r5, [sp, #8]
	adds	sp, sp, #16

	bx	lr

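	/* Byte-by-byte tail: t1 holds the remaining bytes of the first
	 * string and w2 the corresponding bytes of the second, both
	 * arranged so the next byte in string order is in the lane
	 * selected by LSB.  While the s2 byte is non-NUL (CMP r0, #1
	 * sets C) and equal to the s1 byte, shift both words to the
	 * next byte and repeat; otherwise return the byte difference
	 * s1 - s2. */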
8:
	and	r2, t1, #LSB
	and	r0, w2, #LSB
	cmp	r0, #1
	it	cs
	cmpcs	r0, r2
	itt	eq
	S2LOMEMEQ	t1, t1, #8
	S2LOMEMEQ	w2, w2, #8
	beq	8b
	sub	r0, r2, r0

	/* Restore registers and stack. */
	ldrd	r6, r7, [sp]
	ldrd	r4, r5, [sp, #8]
	adds	sp, sp, #16

	bx	lr
END(strcmp)