/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"

#ifdef __ARMEB__
#define S2LOMEM lsl
#define S2LOMEMEQ lsleq
#define S2HIMEM lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARMEB__ */
#define S2LOMEM lsr
#define S2LOMEMEQ lsreq
#define S2HIMEM lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARMEB__ */

.syntax         unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif
        .global strcmp
        .type   strcmp, %function
strcmp:

#if (defined (__thumb__) && !defined (__thumb2__))
1:
        /* Thumb-1: plain byte-by-byte comparison.  */
        ldrb    r2, [r0]
        ldrb    r3, [r1]
        adds    r0, r0, #1
        adds    r1, r1, #1
        cmp     r2, #0
        beq     2f
        cmp     r2, r3
        beq     1b
2:
        subs    r0, r2, r3
        bx      lr
#elif (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
1:
        ldrb    r2, [r0], #1
        ldrb    r3, [r1], #1
        cmp     r2, #1          /* Carry set iff r2 != 0.  */
        it      cs
        cmpcs   r2, r3          /* Continue only while r2 != 0 and r2 == r3.  */
        beq     1b
        subs    r0, r2, r3
        RETURN


#elif (defined (_ISA_THUMB_2) || defined (_ISA_ARM_6))
      /* Use LDRD whenever possible.  */

/* The main thing to look out for when comparing large blocks is that
   the loads do not cross a page boundary when loading past the index
   of the byte with the first difference or the first string-terminator.

   For example, if the strings are identical and the string-terminator
   is at index k, byte-by-byte comparison will not load beyond address
   s1+k and s2+k; word-by-word comparison may load up to 3 bytes beyond
   k; double-word comparison, up to 7 bytes.  If the load of these bytes
   crosses a page boundary, it might cause a memory fault (if the page is
   not mapped) that would not have happened in byte-by-byte comparison.

   If an address is (double-)word aligned, then a load of a (double) word
   from that address will not cross a page boundary.
   Therefore, the algorithm below considers word and double-word alignment
   of strings separately.  */
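
/* For example, with 4 KiB pages, a double-word load from an 8-byte-aligned
   address a reads bytes a..a+7; because the page size is a multiple of 8,
   all eight bytes lie in the same page as a.  */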

/* High-level description of the algorithm.

   * The fast path: if both strings are double-word aligned,
     use LDRD to load two words from each string in every loop iteration.
   * If the strings have the same offset from a word boundary,
     use LDRB to load and compare byte by byte until
     the first string is aligned to a word boundary (at most 3 bytes).
     This is optimized for quick return on short unaligned strings.
   * If the strings have the same offset from a double-word boundary,
     use LDRD to load two words from each string in every loop iteration,
     as in the fast path.
   * If the strings do not have the same offset from a double-word boundary,
     load a word from the second string before the loop to initialize the queue.
     Use LDRD to load two words from each string in every loop iteration.
     Inside the loop, load the second word from the second string only after
     comparing the first word, using the queued value, to guarantee safety
     across page boundaries.
   * If the strings do not have the same offset from a word boundary,
     use LDR and a shift queue.  The order of loads and comparisons matters,
     as in the previous case.

   * Use UADD8 and SEL to find zero bytes within a word, and use REV and CLZ
     to compute the return value.
   * The only difference between ARM and Thumb modes is the use of the CBZ
     instruction.
   * The only difference between big and little endian is the use of REV in
     little endian to compute the return value, instead of MOV.
   * No preload. [TODO.]
*/
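
/* Roughly, the double-word fast path behaves like the following C sketch
   (illustrative only, not part of the original source; little-endian,
   s1 and s2 assumed 8-byte aligned; zero_byte_mask() stands in for the
   UADD8/SEL or magic-constant test used by the macros below, and
   return_from_words() for the REV/CLZ return-value computation at
   do_return):

       const uint32_t *wp1 = (const uint32_t *) s1;
       const uint32_t *wp2 = (const uint32_t *) s2;
       for (;;) {
           uint32_t a0 = *wp1++, a1 = *wp1++;
           uint32_t b0 = *wp2++, b1 = *wp2++;
           if (a0 != b0 || zero_byte_mask (a0) != 0)
               return return_from_words (a0, b0);
           if (a1 != b1 || zero_byte_mask (a1) != 0)
               return return_from_words (a1, b1);
       }
*/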

        .macro m_cbz reg label
#ifdef __thumb2__
        cbz     \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        beq     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbz */

        .macro m_cbnz reg label
#ifdef __thumb2__
        cbnz    \reg, \label
#else   /* not defined __thumb2__ */
        cmp     \reg, #0
        bne     \label
#endif /* not defined __thumb2__ */
        .endm /* m_cbnz */

        .macro  init
        /* Macro to save temporary registers and prepare magic values.  */
        subs    sp, sp, #16
        strd    r4, r5, [sp, #8]
        strd    r6, r7, [sp]
        mvn     r6, #0  /* all F */
        mov     r7, #0  /* all 0 */
        .endm   /* init */

        .macro  magic_compare_and_branch w1 w2 label
        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
        magic_find_zero_bytes \w1
        it      eq
        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
        bne     \label
        .endm /* magic_compare_and_branch */

        .macro  magic_find_zero_bytes w1
        /* Macro to find all-zero bytes in w1; the result is in ip.  */
#if (defined (__ARM_FEATURE_DSP))
        uadd8   ip, \w1, r6
        sel     ip, r7, r6
#else /* not defined (__ARM_FEATURE_DSP) */
        /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
        Coincidentally, these processors only have Thumb-2 mode, where we can
        use the (large) magic constant directly as an immediate in instructions.
        Note that we cannot use the magic constant in ARM mode, where we need
        to create the constant in a register.  */
        sub     ip, \w1, #0x01010101
        bic     ip, ip, \w1
        and     ip, ip, #0x80808080
#endif /* not defined (__ARM_FEATURE_DSP) */
        .endm /* magic_find_zero_bytes */
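
        /* Illustrative C equivalent of magic_find_zero_bytes (an informal
        sketch, not part of the original source).  For the non-DSP variant:

            uint32_t zero_byte_mask (uint32_t w)
            {
                return (w - 0x01010101u) & ~w & 0x80808080u;
            }

        The result is non-zero exactly when w contains a zero byte; marker
        bits above the first zero byte may be spurious, which is harmless
        here because those bytes lie past the terminator.  The UADD8/SEL
        variant answers the same question: UADD8 adds 0xff to each byte and
        sets the per-byte GE flags on carry-out (i.e. for every non-zero
        byte), and SEL then picks 0x00 (r7) for those bytes and 0xff (r6)
        for the zero bytes.  */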

        .macro  setup_return w1 w2
#ifdef __ARMEB__
        mov     r1, \w1
        mov     r2, \w2
#else /* not  __ARMEB__ */
        rev     r1, \w1
        rev     r2, \w2
#endif /* not  __ARMEB__ */
        .endm /* setup_return */

        /*
        optpld r0, #0
        optpld r1, #0
        */

        /* Are both strings double-word aligned?  */
        orr     ip, r0, r1
        tst     ip, #7
        bne     do_align

        /* Fast path.  */
        init

doubleword_aligned:

        /* Get here when the strings to compare are double-word aligned.  */
        /* Compare two words in every iteration.  */
        .p2align        2
2:
        /*
        optpld r0, #16
        optpld r1, #16
        */

        /* Load the next double-word from each string.  */
        ldrd    r2, r3, [r0], #8
        ldrd    r4, r5, [r1], #8

        magic_compare_and_branch w1=r2, w2=r4, label=return_24
        magic_compare_and_branch w1=r3, w2=r5, label=return_35
        b       2b

do_align:
        /* Is the first string word-aligned?  */
        ands    ip, r0, #3
        beq     word_aligned_r0

        /* Fast compare byte by byte until the first string is word-aligned.  */
        /* The offset of r0 from a word boundary is in ip. Thus, the number of
        bytes to read until the next word boundary is 4-ip.  */
        bic     r0, r0, #3
        ldr     r2, [r0], #4
        lsls    ip, ip, #31
        beq     byte2
        bcs     byte3

byte1:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE1_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte2:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE2_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbz   reg=r3, label=fast_return

byte3:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE3_OFFSET
        subs    ip, r3, ip
        bne     fast_return
        m_cbnz  reg=r3, label=word_aligned_r0

fast_return:
        mov     r0, ip
        bx      lr

word_aligned_r0:
        init
        /* The first string is word-aligned.  */
        /* Is the second string word-aligned?  */
        ands    ip, r1, #3
        bne     strcmp_unaligned

word_aligned:
        /* The strings are word-aligned. */
        /* Is the first string double-word aligned?  */
        tst     r0, #4
        beq     doubleword_aligned_r0

        /* If r0 is not double-word aligned yet, align it by loading
        and comparing the next word from each string.  */
        ldr     r2, [r0], #4
        ldr     r4, [r1], #4
        magic_compare_and_branch w1=r2 w2=r4 label=return_24

doubleword_aligned_r0:
        /* Get here when r0 is double-word aligned.  */
        /* Is r1 double-word aligned?  */
        tst     r1, #4
        beq     doubleword_aligned

        /* Get here when the strings to compare are word-aligned,
        r0 is double-word aligned, but r1 is not double-word aligned.  */

        /* Initialize the queue.  */
        ldr     r5, [r1], #4

        /* Compare two words in every iteration.  */
        .p2align        2
3:
        /*
        optpld r0, #16
        optpld r1, #16
        */

        /* Load the next double-word from each string and compare.  */
        ldrd    r2, r3, [r0], #8
        magic_compare_and_branch w1=r2 w2=r5 label=return_25
        ldrd    r4, r5, [r1], #8
        magic_compare_and_branch w1=r3 w2=r4 label=return_34
        b       3b
        .macro miscmp_word offsetlo offsethi
        /* Macro to compare misaligned strings.  */
        /* r0, r1 are word-aligned, and at least one of the strings
        is not double-word aligned.  */
        /* Compare one word in every loop iteration.  */
        /* OFFSETLO is the original bit-offset of r1 from a word boundary,
        OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4

        /* Compare one word from each string in every loop iteration.  */
        .p2align        2
7:
        ldr     r3, [r0], #4
        S2LOMEM r5, r5, #\offsetlo
        magic_find_zero_bytes w1=r3
        cmp     r7, ip, S2HIMEM #\offsetlo
        and     r2, r3, r6, S2LOMEM #\offsetlo
        it      eq
        cmpeq   r2, r5
        bne     return_25
        ldr     r5, [r1], #4
        cmp     ip, #0
        eor     r3, r2, r3
        S2HIMEM r2, r5, #\offsethi
        it      eq
        cmpeq   r3, r2
        bne     return_32
        b       7b
        .endm /* miscmp_word */
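
        /* Illustrative little-endian C sketch of one miscmp_word iteration
        (an approximation, not taken from the original source).  Here shift
        equals OFFSETLO, queue holds the previously loaded word of the second
        string, wp1/wp2 stand for word pointers into s1/s2, and
        zero_byte_mask() is the helper sketched above.  The first test checks
        the low bytes of w1 against the queued tail of the previous s2 word;
        the second checks the remaining high bytes of w1 against the head of
        the freshly loaded s2 word:

            for (;;) {
                uint32_t w1 = *wp1++;
                uint32_t zb = zero_byte_mask (w1);
                if ((zb << shift) != 0
                    || (w1 & (0xffffffffu >> shift)) != (queue >> shift))
                    break;
                queue = *wp2++;
                if (zb != 0
                    || (w1 & ~(0xffffffffu >> shift)) != (queue << (32 - shift)))
                    break;
            }

        Loading the next word of s2 only after the first comparison succeeds
        is what keeps the loads page-safe, as described at the top of this
        file.  */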

strcmp_unaligned:
        /* r0 is word-aligned, r1 is at offset ip from a word.  */
        /* Align r1 to the (previous) word boundary.  */
        bic     r1, r1, #3

        /* Unaligned comparison word by word using LDRs. */
        cmp     ip, #2
        beq     miscmp_word_16                    /* If ip == 2.  */
        bge     miscmp_word_24                    /* If ip == 3.  */
        miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
miscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8


return_32:
        setup_return w1=r3, w2=r2
        b       do_return
return_34:
        setup_return w1=r3, w2=r4
        b       do_return
return_25:
        setup_return w1=r2, w2=r5
        b       do_return
return_35:
        setup_return w1=r3, w2=r5
        b       do_return
return_24:
        setup_return w1=r2, w2=r4

do_return:

#ifdef __ARMEB__
        mov     r0, ip
#else /* not  __ARMEB__ */
        rev     r0, ip
#endif /* not  __ARMEB__ */

        /* Restore temporaries early, before computing the return value.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16

        /* There is a zero byte or a differing byte in r1 and r2.  */
        /* r0 contains a mask of all-zero bytes in r1.  */
        /* Using r0 and not ip here because cbz requires a low register.  */
        m_cbz   reg=r0, label=compute_return_value
        clz     r0, r0
        /* r0 contains the number of bits to the left of the first all-zero byte in r1.  */
        rsb     r0, r0, #24
        /* Here, r0 contains the number of bits to the right of the first all-zero byte in r1.  */
        lsr     r1, r1, r0
        lsr     r2, r2, r0

compute_return_value:
        movs    r0, #1
        cmp     r1, r2
        /* The return value is computed as follows.
        If r1 > r2 then (C==1 and Z==0), LS does not hold, and r0 is #1 at return.
        If r1 < r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
        which means r0 := r0 - r0 - 1 and r0 is #-1 at return.
        If r1 == r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
        which means r0 := r0 - r0 and r0 is #0 at return.
        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
        it      ls
        sbcls   r0, r0, r0
        bx      lr


#else   /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
             defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
             (defined (__thumb__) && !defined (__thumb2__))) */

        /* Use LDR whenever possible. */

#ifdef __thumb2__
#define magic1(REG) 0x01010101
#define magic2(REG) 0x80808080
#else
#define magic1(REG) REG
#define magic2(REG) REG, lsl #7
#endif

        optpld  r0
        optpld  r1
        eor     r2, r0, r1
        tst     r2, #3
        /* Strings are not at the same byte offset from a word boundary.  */
        bne     strcmp_unaligned
        ands    r2, r0, #3
        bic     r0, r0, #3
        bic     r1, r1, #3
        ldr     ip, [r0], #4
        it      eq
        ldreq   r3, [r1], #4
        beq     1f
        /* Although s1 and s2 have identical initial alignment, they are
        not currently word aligned.  Rather than comparing bytes,
        make sure that any bytes fetched from before the addressed
        bytes are forced to 0xff.  Then they will always compare
        equal.  */
        eor     r2, r2, #3
        lsl     r2, r2, #3
        mvn     r3, MSB
        S2LOMEM r2, r3, r2
        ldr     r3, [r1], #4
        orr     ip, ip, r2
        orr     r3, r3, r2
1:
#ifndef __thumb2__
        /* Load the 'magic' constant 0x01010101 into r4.  */
        str     r4, [sp, #-4]!
        mov     r4, #1
        orr     r4, r4, r4, lsl #8
        orr     r4, r4, r4, lsl #16
#endif
        .p2align        2
4:
        optpld  r0, #8
        optpld  r1, #8
        sub     r2, ip, magic1(r4)
        cmp     ip, r3
        itttt   eq
        /* check for any zero bytes in first word */
        biceq   r2, r2, ip
        tsteq   r2, magic2(r4)
        ldreq   ip, [r0], #4
        ldreq   r3, [r1], #4
        beq     4b
2:
        /* There's a zero or a different byte in the word */
        S2HIMEM r0, ip, #24
        S2LOMEM ip, ip, #8
        cmp     r0, #1
        it      cs
        cmpcs   r0, r3, S2HIMEM #24
        it      eq
        S2LOMEMEQ r3, r3, #8
        beq     2b
        /* On a big-endian machine, r0 contains the desired byte in bits
        0-7; on a little-endian machine they are in bits 24-31.  In
        both cases the other bits in r0 are all zero.  For r3 the
        interesting byte is at the other end of the word, but the
        other bits are not necessarily zero.  We need a signed result
        representing the difference in the unsigned bytes, so for the
        little-endian case we can't just shift the interesting bits
        up.  */
#ifdef __ARMEB__
        sub     r0, r0, r3, lsr #24
#else
        and     r3, r3, #255
#ifdef __thumb2__
        /* No RSB instruction in Thumb2 */
        lsr     r0, r0, #24
        sub     r0, r0, r3
#else
        rsb     r0, r3, r0, lsr #24
#endif
#endif
#ifndef __thumb2__
        ldr     r4, [sp], #4
#endif
        RETURN


strcmp_unaligned:

#if 0
        /* The assembly code below is based on the following algorithm.  */
#ifdef __ARMEB__
#define RSHIFT <<
#define LSHIFT >>
#else
#define RSHIFT >>
#define LSHIFT <<
#endif

#define body(shift)                                                     \
  mask = 0xffffffffU RSHIFT shift;                                      \
  w1 = *wp1++;                                                          \
  w2 = *wp2++;                                                          \
  do                                                                    \
    {                                                                   \
      t1 = w1 & mask;                                                   \
      if (__builtin_expect(t1 != w2 RSHIFT shift, 0))                   \
        {                                                               \
          w2 RSHIFT= shift;                                             \
          break;                                                        \
        }                                                               \
      if (__builtin_expect(((w1 - b1) & ~w1) & (b1 << 7), 0))           \
        {                                                               \
          /* See comment in assembler below re syndrome on big-endian */\
          if ((((w1 - b1) & ~w1) & (b1 << 7)) & mask)                   \
            w2 RSHIFT= shift;                                           \
          else                                                          \
            {                                                           \
              w2 = *wp2;                                                \
              t1 = w1 RSHIFT (32 - shift);                              \
              w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift);        \
            }                                                           \
          break;                                                        \
        }                                                               \
      w2 = *wp2++;                                                      \
      t1 ^= w1;                                                         \
      if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0))            \
        {                                                               \
          t1 = w1 >> (32 - shift);                                      \
          w2 = (w2 << (32 - shift)) RSHIFT (32 - shift);                \
          break;                                                        \
        }                                                               \
      w1 = *wp1++;                                                      \
    } while (1)

  const unsigned* wp1;
  const unsigned* wp2;
  unsigned w1, w2;
  unsigned mask;
  unsigned shift;
  unsigned b1 = 0x01010101;
  char c1, c2;
  unsigned t1;

  while (((unsigned) s1) & 3)
    {
      c1 = *s1++;
      c2 = *s2++;
      if (c1 == 0 || c1 != c2)
        return c1 - (int)c2;
    }
  wp1 = (unsigned*) (((unsigned)s1) & ~3);
  wp2 = (unsigned*) (((unsigned)s2) & ~3);
  t1 = ((unsigned) s2) & 3;
  if (t1 == 1)
    {
      body(8);
    }
  else if (t1 == 2)
    {
      body(16);
    }
  else
    {
      body(24);
    }

  do
    {
#ifdef __ARMEB__
      c1 = (char) (t1 >> 24);
      c2 = (char) (w2 >> 24);
#else /* not  __ARMEB__ */
      c1 = (char) t1;
      c2 = (char) w2;
#endif /* not  __ARMEB__ */
      t1 RSHIFT= 8;
      w2 RSHIFT= 8;
    } while (c1 != 0 && c1 == c2);
  return c1 - c2;
#endif /* 0 */


        wp1 .req r0
        wp2 .req r1
        b1  .req r2
        w1  .req r4
        w2  .req r5
        t1  .req ip
        @ r3 is scratch

        /* First of all, compare bytes until wp1 (s1) is word-aligned. */
1:
        tst     wp1, #3
        beq     2f
        ldrb    r2, [wp1], #1
        ldrb    r3, [wp2], #1
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        beq     1b
        sub     r0, r2, r3
        RETURN

2:
        str     r5, [sp, #-4]!
        str     r4, [sp, #-4]!
        //stmfd   sp!, {r4, r5}
        mov     b1, #1
        orr     b1, b1, b1, lsl #8
        orr     b1, b1, b1, lsl #16

        and     t1, wp2, #3
        bic     wp2, wp2, #3
        ldr     w1, [wp1], #4
        ldr     w2, [wp2], #4
        cmp     t1, #2
        beq     2f
        bhi     3f

        /* Critical inner loop: block with 3 bytes of initial overlap */
        .p2align        2
1:
        bic     t1, w1, MSB
        cmp     t1, w2, S2LOMEM #8
        sub     r3, w1, b1
        bic     r3, r3, w1
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, S2HIMEM #24
        bne     6f
        ldr     w1, [wp1], #4
        b       1b
4:
        S2LOMEM w2, w2, #8
        b       8f

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
        with the bytes 0x01 0x00 */
        tst     w1, #0xff000000
        itt     ne
        tstne   w1, #0x00ff0000
        tstne   w1, #0x0000ff00
        beq     7f
#else
        bics    r3, r3, #0xff000000
        bne     7f
#endif
        ldrb    w2, [wp2]
        S2LOMEM t1, w1, #24
#ifdef __ARMEB__
        lsl     w2, w2, #24
#endif
        b       8f

6:
        S2LOMEM t1, w1, #24
        and     w2, w2, LSB
        b       8f

        /* Critical inner loop: block with 2 bytes of initial overlap */
        .p2align        2
2:
        S2HIMEM t1, w1, #16
        sub     r3, w1, b1
        S2LOMEM t1, t1, #16
        bic     r3, r3, w1
        cmp     t1, w2, S2LOMEM #16
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, S2HIMEM #16
        bne     6f
        ldr     w1, [wp1], #4
        b       2b

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
        with the bytes 0x01 0x00 */
        tst     w1, #0xff000000
        it      ne
        tstne   w1, #0x00ff0000
        beq     7f
#else
        lsls    r3, r3, #16
        bne     7f
#endif
        ldrh    w2, [wp2]
        S2LOMEM t1, w1, #16
#ifdef __ARMEB__
        lsl     w2, w2, #16
#endif
        b       8f

6:
        S2HIMEM w2, w2, #16
        S2LOMEM t1, w1, #16
4:
        S2LOMEM w2, w2, #16
        b       8f

        /* Critical inner loop: block with 1 byte of initial overlap */
        .p2align        2
3:
        and     t1, w1, LSB
        cmp     t1, w2, S2LOMEM #24
        sub     r3, w1, b1
        bic     r3, r3, w1
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, S2HIMEM #8
        bne     6f
        ldr     w1, [wp1], #4
        b       3b
4:
        S2LOMEM w2, w2, #24
        b       8f
5:
        /* The syndrome value may contain false ones if the string ends
        with the bytes 0x01 0x00 */
        tst     w1, LSB
        beq     7f
        ldr     w2, [wp2], #4
6:
        S2LOMEM t1, w1, #8
        bic     w2, w2, MSB
        b       8f
7:
        mov     r0, #0
        //ldmfd   sp!, {r4, r5}
        ldr     r4, [sp], #4
        ldr     r5, [sp], #4
        RETURN
8:
        and     r2, t1, LSB
        and     r0, w2, LSB
        cmp     r0, #1
        it      cs
        cmpcs   r0, r2
        itt     eq
        S2LOMEMEQ t1, t1, #8
        S2LOMEMEQ w2, w2, #8
        beq     8b
        sub     r0, r2, r0
        //ldmfd   sp!, {r4, r5}
        ldr     r4, [sp], #4
        ldr     r5, [sp], #4
        RETURN

#endif /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
            defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
            (defined (__thumb__) && !defined (__thumb2__))) */
    788