/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#ifdef __ARMEB__
#define S2LOMEM lsl
#define S2LOMEMEQ lsleq
#define S2HIMEM lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARMEB__ */
#define S2LOMEM lsr
#define S2LOMEMEQ lsreq
#define S2HIMEM lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARMEB__ */

        .syntax unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif

ENTRY(strcmp)
        /* Use LDRD whenever possible.  */

/* The main thing to look out for when comparing large blocks is that
   the loads do not cross a page boundary when loading past the index
   of the byte with the first difference or the first string terminator.

   For example, if the strings are identical and the string terminator
   is at index k, byte-by-byte comparison will not load beyond address
   s1+k and s2+k; word-by-word comparison may load up to 3 bytes beyond
   k, and double-word comparison up to 7 bytes.  If the load of these
   bytes crosses a page boundary, it might cause a memory fault (if the
   page is not mapped) that would not have happened in byte-by-byte
   comparison.

   If an address is (double-)word aligned, then a load of a (double) word
   from that address will not cross a page boundary.
   Therefore, the algorithm below considers word and double-word alignment
   of strings separately.  */

/* High-level description of the algorithm.

   * The fast path: if both strings are double-word aligned,
     use LDRD to load two words from each string in every loop iteration.
   * If the strings have the same offset from a word boundary,
     use LDRB to load and compare byte by byte until
     the first string is aligned to a word boundary (at most 3 bytes).
     This is optimized for quick return on short unaligned strings.
   * If the strings have the same offset from a double-word boundary,
     use LDRD to load two words from each string in every loop iteration,
     as in the fast path.
   * If the strings do not have the same offset from a double-word boundary,
     load a word from the second string before the loop to initialize the
     queue.  Use LDRD to load two words from each string in every loop
     iteration.  Inside the loop, load the second word from the second
     string only after comparing the first word, using the queued value,
     to guarantee safety across page boundaries.
   * If the strings do not have the same offset from a word boundary,
     use LDR and a shift queue.  The order of loads and comparisons
     matters, as in the previous case.

   * Use UADD8 and SEL to compare words, and use REV and CLZ to compute
     the return value.
   * The only difference between ARM and Thumb modes is the use of the
     CBZ instruction.
   * The only difference between big and little endian is the use of REV
     in little endian to compute the return value, instead of MOV.  */
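/* For reference: every path below must agree with a plain byte-by-byte
   comparison; the optimized paths only change how many bytes each load
   examines.  A minimal C sketch of the required semantics (illustrative
   only, not part of the build; the name strcmp_reference is ours):

       // Hypothetical reference model, not assembled into this file.
       static int strcmp_reference(const char *s1, const char *s2) {
           const unsigned char *p1 = (const unsigned char *) s1;
           const unsigned char *p2 = (const unsigned char *) s2;
           while (*p1 != '\0' && *p1 == *p2) {
               p1++;
               p2++;
           }
           return *p1 - *p2;  // sign encodes the order of the strings
       }
*/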
.macro m_cbz reg label
#ifdef __thumb2__
        cbz     \reg, \label
#else /* not defined __thumb2__ */
        cmp     \reg, #0
        beq     \label
#endif /* not defined __thumb2__ */
.endm /* m_cbz */

.macro m_cbnz reg label
#ifdef __thumb2__
        cbnz    \reg, \label
#else /* not defined __thumb2__ */
        cmp     \reg, #0
        bne     \label
#endif /* not defined __thumb2__ */
.endm /* m_cbnz */

.macro init
        /* Macro to save temporary registers and prepare magic values.  */
        subs    sp, sp, #16
        .cfi_def_cfa_offset 16
        strd    r4, r5, [sp, #8]
        .cfi_rel_offset r4, 8
        .cfi_rel_offset r5, 12
        strd    r6, r7, [sp]
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4
        mvn     r6, #0  /* all F */
        mov     r7, #0  /* all 0 */
.endm /* init */

.macro magic_compare_and_branch w1 w2 label
        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
        magic_find_zero_bytes \w1
        it      eq
        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
        bne     \label
.endm /* magic_compare_and_branch */

.macro magic_find_zero_bytes w1
        /* Macro to find all-zero bytes in w1; the result is in ip.  */
        uadd8   ip, \w1, r6
        sel     ip, r7, r6
.endm /* magic_find_zero_bytes */

.macro setup_return w1 w2
#ifdef __ARMEB__
        mov     r1, \w1
        mov     r2, \w2
#else /* not __ARMEB__ */
        rev     r1, \w1
        rev     r2, \w2
#endif /* not __ARMEB__ */
.endm /* setup_return */

        pld     [r0, #0]
        pld     [r1, #0]

        /* Are both strings double-word aligned?  */
        orr     ip, r0, r1
        tst     ip, #7
        bne     .L_do_align

        /* Fast path.  */
        init

.L_doubleword_aligned:

        /* Get here when the strings to compare are double-word aligned.  */
        /* Compare two words in every iteration.  */
        .p2align 2
2:
        pld     [r0, #16]
        pld     [r1, #16]

        /* Load the next double-word from each string.  */
        ldrd    r2, r3, [r0], #8
        ldrd    r4, r5, [r1], #8

        magic_compare_and_branch w1=r2, w2=r4, label=.L_return_24
        magic_compare_and_branch w1=r3, w2=r5, label=.L_return_35
        b       2b

.L_do_align:
        /* Is the first string word-aligned?  */
        ands    ip, r0, #3
        beq     .L_word_aligned_r0

        /* Fast compare byte by byte until the first string is word-aligned.  */
        /* The offset of r0 from a word boundary is in ip.  Thus, the number
           of bytes to read until the next word boundary is 4-ip.  */
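        /* The LSLS below dispatches on the two low bits of the offset in a
           single instruction: after "lsls ip, ip, #31", Z is set exactly
           when bit 0 of the offset is clear, and C receives bit 1.  Thus:
               offset 1 (0b01): Z == 0, C == 0, fall through to .L_byte1;
               offset 2 (0b10): Z == 1, C == 1, BEQ reaches .L_byte2;
               offset 3 (0b11): Z == 0, C == 1, BCS reaches .L_byte3.
           The .L_byte1/2/3 blocks fall through into one another, so exactly
           4-ip bytes are compared before the word-aligned code is reached.  */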
        bic     r0, r0, #3
        ldr     r2, [r0], #4
        lsls    ip, ip, #31
        beq     .L_byte2
        bcs     .L_byte3

.L_byte1:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE1_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        m_cbz   reg=r3, label=.L_fast_return

.L_byte2:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE2_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        m_cbz   reg=r3, label=.L_fast_return

.L_byte3:
        ldrb    ip, [r1], #1
        uxtb    r3, r2, ror #BYTE3_OFFSET
        subs    ip, r3, ip
        bne     .L_fast_return
        m_cbnz  reg=r3, label=.L_word_aligned_r0

.L_fast_return:
        mov     r0, ip
        bx      lr

.L_word_aligned_r0:
        init
        /* The first string is word-aligned.  */
        /* Is the second string word-aligned?  */
        ands    ip, r1, #3
        bne     .L_strcmp_unaligned

.L_word_aligned:
        /* The strings are word-aligned.  */
        /* Is the first string double-word aligned?  */
        tst     r0, #4
        beq     .L_doubleword_aligned_r0

        /* If r0 is not double-word aligned yet, align it by loading
           and comparing the next word from each string.  */
        ldr     r2, [r0], #4
        ldr     r4, [r1], #4
        magic_compare_and_branch w1=r2 w2=r4 label=.L_return_24

.L_doubleword_aligned_r0:
        /* Get here when r0 is double-word aligned.  */
        /* Is r1 double-word aligned?  */
        tst     r1, #4
        beq     .L_doubleword_aligned

        /* Get here when the strings to compare are word-aligned,
           r0 is double-word aligned, but r1 is not double-word aligned.  */

        /* Initialize the queue.  */
        ldr     r5, [r1], #4

        /* Compare two words in every iteration.  */
        .p2align 2
3:
        pld     [r0, #16]
        pld     [r1, #16]

        /* Load the next double-word from each string and compare.  */
        ldrd    r2, r3, [r0], #8
        magic_compare_and_branch w1=r2 w2=r5 label=.L_return_25
        ldrd    r4, r5, [r1], #8
        magic_compare_and_branch w1=r3 w2=r4 label=.L_return_34
        b       3b

.macro miscmp_word offsetlo offsethi
        /* Macro to compare misaligned strings.  */
        /* r0, r1 are word-aligned, and at least one of the strings
           is not double-word aligned.  */
        /* Compare one word in every loop iteration.  */
        /* OFFSETLO is the original bit-offset of r1 from a word boundary,
           OFFSETHI is 32 - OFFSETLO (i.e., the offset from the next word).  */

        /* Initialize the shift queue.  */
        ldr     r5, [r1], #4

        /* Compare one word from each string in every loop iteration.  */
        .p2align 2
7:
        ldr     r3, [r0], #4
        S2LOMEM r5, r5, #\offsetlo
        magic_find_zero_bytes w1=r3
        cmp     r7, ip, S2HIMEM #\offsetlo
        and     r2, r3, r6, S2LOMEM #\offsetlo
        it      eq
        cmpeq   r2, r5
        bne     .L_return_25
        ldr     r5, [r1], #4
        cmp     ip, #0
        eor     r3, r2, r3
        S2HIMEM r2, r5, #\offsethi
        it      eq
        cmpeq   r3, r2
        bne     .L_return_32
        b       7b
.endm /* miscmp_word */

.L_strcmp_unaligned:
        /* r0 is word-aligned, r1 is at offset ip from a word boundary.  */
        /* Align r1 to the (previous) word boundary.  */
        bic     r1, r1, #3

        /* Unaligned comparison word by word using LDRs.  */
        cmp     ip, #2
        beq     .L_miscmp_word_16                  /* If ip == 2.  */
        bge     .L_miscmp_word_24                  /* If ip == 3.  */
        miscmp_word offsetlo=8 offsethi=24         /* If ip == 1.  */
.L_miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
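/* The return paths below consume the mask built by magic_find_zero_bytes:
   UADD8 adds 0xff to each byte of w1, setting the per-byte GE flags for
   every byte that is nonzero, and SEL then picks 0x00 for those bytes and
   0xff for the zero bytes.  A portable C sketch of the same mask, for
   reading along (illustrative only; zero_byte_mask is our name, and the
   macro itself assembles to UADD8/SEL, not to this loop):

       #include <stdint.h>

       // Hypothetical model of magic_find_zero_bytes: 0xff in every byte
       // position where w has a zero byte, 0x00 elsewhere.
       static uint32_t zero_byte_mask(uint32_t w) {
           uint32_t mask = 0;
           for (int i = 0; i < 4; i++) {
               if (((w >> (8 * i)) & 0xff) == 0)
                   mask |= (uint32_t) 0xff << (8 * i);
           }
           return mask;
       }
*/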
.L_return_32:
        setup_return w1=r3, w2=r2
        b       .L_do_return
.L_return_34:
        setup_return w1=r3, w2=r4
        b       .L_do_return
.L_return_25:
        setup_return w1=r2, w2=r5
        b       .L_do_return
.L_return_35:
        setup_return w1=r3, w2=r5
        b       .L_do_return
.L_return_24:
        setup_return w1=r2, w2=r4

.L_do_return:

#ifdef __ARMEB__
        mov     r0, ip
#else /* not __ARMEB__ */
        rev     r0, ip
#endif /* not __ARMEB__ */

        /* Restore temporaries early, before computing the return value.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16
        .cfi_def_cfa_offset 0
        .cfi_restore r4
        .cfi_restore r5
        .cfi_restore r6
        .cfi_restore r7

        /* Either r1 contains a zero byte, or r1 and r2 differ.  */
        /* r0 contains a mask of all-zero bytes in r1.  */
        /* Using r0 and not ip here because CBZ requires a low register.  */
        m_cbz   reg=r0, label=.L_compute_return_value
        clz     r0, r0
        /* r0 contains the number of bits to the left of the first
           all-zero byte in r1.  */
        rsb     r0, r0, #24
        /* Here, r0 contains the number of bits to the right of the first
           all-zero byte in r1.  */
        lsr     r1, r1, r0
        lsr     r2, r2, r0

.L_compute_return_value:
        movs    r0, #1
        cmp     r1, r2
        /* The return value is computed as follows.
           If r1 > r2 then (C == 1 and Z == 0), LS does not hold, and r0 is #1 at return.
           If r1 < r2 then (C == 0 and Z == 0), and we execute SBC with carry_in = 0,
           which means r0 := r0 - r0 - 1, so r0 is #-1 at return.
           If r1 == r2 then (C == 1 and Z == 1), and we execute SBC with carry_in = 1,
           which means r0 := r0 - r0, so r0 is #0 at return.
           (C == 0 and Z == 1) cannot happen because the carry bit is "not borrow".  */
        it      ls
        sbcls   r0, r0, r0
        bx      lr

    /* The code from the previous version of strcmp.S handles this
     * particular case (the second string is 2 bytes off a word alignment)
     * faster than any current version.  In this very specific case, use the
     * previous version.  See bionic/libc/arch-arm/cortex-a15/bionic/strcmp.S
     * for the unedited version of this code.
     */
.L_miscmp_word_16:
        wp1 .req r0
        wp2 .req r1
        b1  .req r2
        w1  .req r4
        w2  .req r5
        t1  .req ip
        @ r3 is scratch

        /* At this point, wp1 (r0) has already been word-aligned.  */
2:
        mov     b1, #1
        orr     b1, b1, b1, lsl #8
        orr     b1, b1, b1, lsl #16

        and     t1, wp2, #3
        bic     wp2, wp2, #3
        ldr     w1, [wp1], #4
        ldr     w2, [wp2], #4

        /* Critical inner loop: block with 2 bytes of initial overlap.  */
        .p2align 2
2:
        S2HIMEM t1, w1, #16
        sub     r3, w1, b1
        S2LOMEM t1, t1, #16
        bic     r3, r3, w1
        cmp     t1, w2, S2LOMEM #16
        bne     4f
        ands    r3, r3, b1, lsl #7
        it      eq
        ldreq   w2, [wp2], #4
        bne     5f
        eor     t1, t1, w1
        cmp     t1, w2, S2HIMEM #16
        bne     6f
        ldr     w1, [wp1], #4
        b       2b

5:
#ifdef __ARMEB__
        /* The syndrome value may contain false ones if the string ends
           with the bytes 0x01 0x00.  */
        tst     w1, #0xff000000
        it      ne
        tstne   w1, #0x00ff0000
        beq     7f
#else
        lsls    r3, r3, #16
        bne     7f
#endif
        ldrh    w2, [wp2]
        S2LOMEM t1, w1, #16
#ifdef __ARMEB__
        lsl     w2, w2, #16
#endif
        b       8f

6:
        S2HIMEM w2, w2, #16
        S2LOMEM t1, w1, #16
4:
        S2LOMEM w2, w2, #16
        b       8f

7:
        mov     r0, #0

        /* Restore registers and stack.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16
        .cfi_def_cfa_offset 0
        .cfi_restore r4
        .cfi_restore r5
        .cfi_restore r6
        .cfi_restore r7

        bx      lr
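        /* The loop at label 8 below resolves the syndrome one byte at a
           time: t1 holds the remaining bytes of the first string and w2
           the corresponding bytes of the second.  While the low bytes are
           equal and nonzero, both registers are shifted down by 8; at the
           first mismatch or terminator, the return value is the difference
           between the two low bytes (first string minus second).  */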
8:
        and     r2, t1, #LSB
        and     r0, w2, #LSB
        cmp     r0, #1
        it      cs
        cmpcs   r0, r2
        itt     eq
        S2LOMEMEQ t1, t1, #8
        S2LOMEMEQ w2, w2, #8
        beq     8b
        sub     r0, r2, r0

        /* Restore registers and stack.  */
        ldrd    r6, r7, [sp]
        ldrd    r4, r5, [sp, #8]
        adds    sp, sp, #16
        .cfi_def_cfa_offset 0
        .cfi_restore r4
        .cfi_restore r5
        .cfi_restore r6
        .cfi_restore r7

        bx      lr
END(strcmp)