1 /* 2 Copyright (c) 2014, Intel Corporation 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 11 * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 15 * Neither the name of Intel Corporation nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #ifdef USE_AS_STRNCMP 32 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz 33 if the new counter > the old one or is 0. 
*/ 34 #define UPDATE_STRNCMP_COUNTER \ 35 /* calculate left number to compare */ \ 36 lea -16(%rcx, %r11), %r9; \ 37 cmp %r9, %r11; \ 38 jb L(strcmp_exitz); \ 39 test %r9, %r9; \ 40 je L(strcmp_exitz); \ 41 mov %r9, %r11 42 43 #else 44 #define UPDATE_STRNCMP_COUNTER 45 #ifndef STRCMP 46 #define STRCMP strcmp 47 #endif 48 #endif 49 50 #ifndef L 51 # define L(label) .L##label 52 #endif 53 54 #ifndef cfi_startproc 55 # define cfi_startproc .cfi_startproc 56 #endif 57 58 #ifndef cfi_endproc 59 # define cfi_endproc .cfi_endproc 60 #endif 61 62 #ifndef ENTRY 63 # define ENTRY(name) \ 64 .type name, @function; \ 65 .globl name; \ 66 .p2align 4; \ 67 name: \ 68 cfi_startproc 69 #endif 70 71 #ifndef END 72 # define END(name) \ 73 cfi_endproc; \ 74 .size name, .-name 75 #endif 76 #define RETURN ret 77 .section .text.ssse3,"ax",@progbits 78 ENTRY (STRCMP) 79 /* 80 * This implementation uses SSE to compare up to 16 bytes at a time. 81 */ 82 #ifdef USE_AS_STRNCMP 83 test %rdx, %rdx 84 je L(strcmp_exitz) 85 cmp $1, %rdx 86 je L(Byte0) 87 mov %rdx, %r11 88 #endif 89 mov %esi, %ecx 90 mov %edi, %eax 91 /* Use 64bit AND here to avoid long NOP padding. */ 92 and $0x3f, %rcx /* rsi alignment in cache line */ 93 and $0x3f, %rax /* rdi alignment in cache line */ 94 cmp $0x30, %ecx 95 ja L(crosscache) /* rsi: 16-byte load will cross cache line */ 96 cmp $0x30, %eax 97 ja L(crosscache) /* rdi: 16-byte load will cross cache line */ 98 movlpd (%rdi), %xmm1 99 movlpd (%rsi), %xmm2 100 movhpd 8(%rdi), %xmm1 101 movhpd 8(%rsi), %xmm2 102 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 103 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 104 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ 105 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 106 pmovmskb %xmm1, %edx 107 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 108 jnz L(less16bytes) /* If not, find different value or null char */ 109 #ifdef USE_AS_STRNCMP 110 sub $16, %r11 111 jbe L(strcmp_exitz) /* finish comparision */ 112 #endif 113 add $16, %rsi /* prepare to search next 16 bytes */ 114 add $16, %rdi /* prepare to search next 16 bytes */ 115 116 /* 117 * Determine source and destination string offsets from 16-byte alignment. 118 * Use relative offset difference between the two to determine which case 119 * below to use. 120 */ 121 .p2align 4 122 L(crosscache): 123 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ 124 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ 125 mov $0xffff, %edx /* for equivalent offset */ 126 xor %r8d, %r8d 127 and $0xf, %ecx /* offset of rsi */ 128 and $0xf, %eax /* offset of rdi */ 129 cmp %eax, %ecx 130 je L(ashr_0) /* rsi and rdi relative offset same */ 131 ja L(bigger) 132 mov %edx, %r8d /* r8d is offset flag for exit tail */ 133 xchg %ecx, %eax 134 xchg %rsi, %rdi 135 L(bigger): 136 lea 15(%rax), %r9 137 sub %rcx, %r9 138 lea L(unaligned_table)(%rip), %r10 139 movslq (%r10, %r9,4), %r9 140 lea (%r10, %r9), %r10 141 jmp *%r10 /* jump to corresponding case */ 142 143 /* 144 * The following cases will be handled by ashr_0 145 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 146 * n(0~15) n(0~15) 15(15+ n-n) ashr_0 147 */ 148 .p2align 4 149 L(ashr_0): 150 151 movdqa (%rsi), %xmm1 152 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ 153 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 154 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ 155 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 156 pmovmskb %xmm1, %r9d 157 shr %cl, %edx /* adjust 0xffff for offset */ 158 shr %cl, %r9d /* adjust for 16-byte offset */ 159 sub %r9d, %edx 160 /* 161 * edx must be the same with r9d if in left byte (16-rcx) is equal to 162 * the start from (16-rax) and no null char was seen. 163 */ 164 jne L(less32bytes) /* mismatch or null char */ 165 UPDATE_STRNCMP_COUNTER 166 mov $16, %rcx 167 mov $16, %r9 168 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ 169 170 /* 171 * Now both strings are aligned at 16-byte boundary. Loop over strings 172 * checking 32-bytes per iteration. 173 */ 174 .p2align 4 175 L(loop_ashr_0): 176 movdqa (%rsi, %rcx), %xmm1 177 movdqa (%rdi, %rcx), %xmm2 178 179 pcmpeqb %xmm1, %xmm0 180 pcmpeqb %xmm2, %xmm1 181 psubb %xmm0, %xmm1 182 pmovmskb %xmm1, %edx 183 sub $0xffff, %edx 184 jnz L(exit) /* mismatch or null char seen */ 185 186 #ifdef USE_AS_STRNCMP 187 sub $16, %r11 188 jbe L(strcmp_exitz) 189 #endif 190 add $16, %rcx 191 movdqa (%rsi, %rcx), %xmm1 192 movdqa (%rdi, %rcx), %xmm2 193 194 pcmpeqb %xmm1, %xmm0 195 pcmpeqb %xmm2, %xmm1 196 psubb %xmm0, %xmm1 197 pmovmskb %xmm1, %edx 198 sub $0xffff, %edx 199 jnz L(exit) 200 #ifdef USE_AS_STRNCMP 201 sub $16, %r11 202 jbe L(strcmp_exitz) 203 #endif 204 add $16, %rcx 205 jmp L(loop_ashr_0) 206 207 /* 208 * The following cases will be handled by ashr_1 209 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 210 * n(15) n -15 0(15 +(n-15) - n) ashr_1 211 */ 212 .p2align 4 213 L(ashr_1): 214 pxor %xmm0, %xmm0 215 movdqa (%rdi), %xmm2 216 movdqa (%rsi), %xmm1 217 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 218 pslldq $15, %xmm2 /* shift first string to align with second */ 219 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ 220 psubb %xmm0, %xmm2 /* packed sub of comparison results*/ 221 pmovmskb %xmm2, %r9d 222 shr %cl, %edx /* adjust 0xffff for offset */ 223 shr %cl, %r9d /* adjust for 16-byte offset */ 224 sub %r9d, %edx 225 jnz L(less32bytes) /* mismatch or null char seen */ 226 movdqa (%rdi), %xmm3 227 UPDATE_STRNCMP_COUNTER 228 229 pxor %xmm0, %xmm0 230 mov $16, %rcx /* index for loads*/ 231 mov $1, %r9d /* byte position left over from less32bytes case */ 232 /* 233 * Setup %r10 value allows us to detect crossing a page boundary. 234 * When %r10 goes positive we have crossed a page boundary and 235 * need to do a nibble. 236 */ 237 lea 1(%rdi), %r10 238 and $0xfff, %r10 /* offset into 4K page */ 239 sub $0x1000, %r10 /* subtract 4K pagesize */ 240 241 .p2align 4 242 L(loop_ashr_1): 243 add $16, %r10 244 jg L(nibble_ashr_1) /* cross page boundary */ 245 246 L(gobble_ashr_1): 247 movdqa (%rsi, %rcx), %xmm1 248 movdqa (%rdi, %rcx), %xmm2 249 movdqa %xmm2, %xmm4 /* store for next cycle */ 250 251 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ 252 253 pcmpeqb %xmm1, %xmm0 254 pcmpeqb %xmm2, %xmm1 255 psubb %xmm0, %xmm1 256 pmovmskb %xmm1, %edx 257 sub $0xffff, %edx 258 jnz L(exit) 259 260 #ifdef USE_AS_STRNCMP 261 sub $16, %r11 262 jbe L(strcmp_exitz) 263 #endif 264 add $16, %rcx 265 movdqa %xmm4, %xmm3 266 267 add $16, %r10 268 jg L(nibble_ashr_1) /* cross page boundary */ 269 270 movdqa (%rsi, %rcx), %xmm1 271 movdqa (%rdi, %rcx), %xmm2 272 movdqa %xmm2, %xmm4 /* store for next cycle */ 273 274 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ 275 276 pcmpeqb %xmm1, %xmm0 277 pcmpeqb %xmm2, %xmm1 278 psubb %xmm0, %xmm1 279 pmovmskb %xmm1, %edx 280 sub $0xffff, %edx 281 jnz L(exit) 282 283 #ifdef USE_AS_STRNCMP 284 sub $16, %r11 285 jbe L(strcmp_exitz) 286 #endif 287 add $16, %rcx 288 movdqa %xmm4, %xmm3 289 jmp L(loop_ashr_1) 290 
291 /* 292 * Nibble avoids loads across page boundary. This is to avoid a potential 293 * access into unmapped memory. 294 */ 295 .p2align 4 296 L(nibble_ashr_1): 297 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ 298 pmovmskb %xmm0, %edx 299 test $0xfffe, %edx 300 jnz L(ashr_1_exittail) /* find null char*/ 301 302 #ifdef USE_AS_STRNCMP 303 cmp $14, %r11 304 jbe L(ashr_1_exittail) 305 #endif 306 307 pxor %xmm0, %xmm0 308 sub $0x1000, %r10 /* substract 4K from %r10 */ 309 jmp L(gobble_ashr_1) 310 311 /* 312 * Once find null char, determine if there is a string mismatch 313 * before the null char. 314 */ 315 .p2align 4 316 L(ashr_1_exittail): 317 movdqa (%rsi, %rcx), %xmm1 318 psrldq $1, %xmm0 319 psrldq $1, %xmm3 320 jmp L(aftertail) 321 322 /* 323 * The following cases will be handled by ashr_2 324 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 325 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 326 */ 327 .p2align 4 328 L(ashr_2): 329 pxor %xmm0, %xmm0 330 movdqa (%rdi), %xmm2 331 movdqa (%rsi), %xmm1 332 pcmpeqb %xmm1, %xmm0 333 pslldq $14, %xmm2 334 pcmpeqb %xmm1, %xmm2 335 psubb %xmm0, %xmm2 336 pmovmskb %xmm2, %r9d 337 shr %cl, %edx 338 shr %cl, %r9d 339 sub %r9d, %edx 340 jnz L(less32bytes) 341 movdqa (%rdi), %xmm3 342 UPDATE_STRNCMP_COUNTER 343 344 pxor %xmm0, %xmm0 345 mov $16, %rcx /* index for loads */ 346 mov $2, %r9d /* byte position left over from less32bytes case */ 347 /* 348 * Setup %r10 value allows us to detect crossing a page boundary. 349 * When %r10 goes positive we have crossed a page boundary and 350 * need to do a nibble. 
351 */ 352 lea 2(%rdi), %r10 353 and $0xfff, %r10 /* offset into 4K page */ 354 sub $0x1000, %r10 /* subtract 4K pagesize */ 355 356 .p2align 4 357 L(loop_ashr_2): 358 add $16, %r10 359 jg L(nibble_ashr_2) 360 361 L(gobble_ashr_2): 362 movdqa (%rsi, %rcx), %xmm1 363 movdqa (%rdi, %rcx), %xmm2 364 movdqa %xmm2, %xmm4 365 366 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ 367 368 pcmpeqb %xmm1, %xmm0 369 pcmpeqb %xmm2, %xmm1 370 psubb %xmm0, %xmm1 371 pmovmskb %xmm1, %edx 372 sub $0xffff, %edx 373 jnz L(exit) 374 375 #ifdef USE_AS_STRNCMP 376 sub $16, %r11 377 jbe L(strcmp_exitz) 378 #endif 379 380 add $16, %rcx 381 movdqa %xmm4, %xmm3 382 383 add $16, %r10 384 jg L(nibble_ashr_2) /* cross page boundary */ 385 386 movdqa (%rsi, %rcx), %xmm1 387 movdqa (%rdi, %rcx), %xmm2 388 movdqa %xmm2, %xmm4 389 390 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ 391 392 pcmpeqb %xmm1, %xmm0 393 pcmpeqb %xmm2, %xmm1 394 psubb %xmm0, %xmm1 395 pmovmskb %xmm1, %edx 396 sub $0xffff, %edx 397 jnz L(exit) 398 399 #ifdef USE_AS_STRNCMP 400 sub $16, %r11 401 jbe L(strcmp_exitz) 402 #endif 403 404 add $16, %rcx 405 movdqa %xmm4, %xmm3 406 jmp L(loop_ashr_2) 407 408 .p2align 4 409 L(nibble_ashr_2): 410 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 411 pmovmskb %xmm0, %edx 412 test $0xfffc, %edx 413 jnz L(ashr_2_exittail) 414 415 #ifdef USE_AS_STRNCMP 416 cmp $13, %r11 417 jbe L(ashr_2_exittail) 418 #endif 419 420 pxor %xmm0, %xmm0 421 sub $0x1000, %r10 422 jmp L(gobble_ashr_2) 423 424 .p2align 4 425 L(ashr_2_exittail): 426 movdqa (%rsi, %rcx), %xmm1 427 psrldq $2, %xmm0 428 psrldq $2, %xmm3 429 jmp L(aftertail) 430 431 /* 432 * The following cases will be handled by ashr_3 433 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 434 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 435 */ 436 .p2align 4 437 L(ashr_3): 438 pxor %xmm0, %xmm0 439 movdqa (%rdi), %xmm2 440 movdqa (%rsi), %xmm1 441 pcmpeqb %xmm1, %xmm0 442 pslldq $13, %xmm2 443 
pcmpeqb %xmm1, %xmm2 444 psubb %xmm0, %xmm2 445 pmovmskb %xmm2, %r9d 446 shr %cl, %edx 447 shr %cl, %r9d 448 sub %r9d, %edx 449 jnz L(less32bytes) 450 movdqa (%rdi), %xmm3 451 452 UPDATE_STRNCMP_COUNTER 453 454 pxor %xmm0, %xmm0 455 mov $16, %rcx /* index for loads */ 456 mov $3, %r9d /* byte position left over from less32bytes case */ 457 /* 458 * Setup %r10 value allows us to detect crossing a page boundary. 459 * When %r10 goes positive we have crossed a page boundary and 460 * need to do a nibble. 461 */ 462 lea 3(%rdi), %r10 463 and $0xfff, %r10 /* offset into 4K page */ 464 sub $0x1000, %r10 /* subtract 4K pagesize */ 465 466 .p2align 4 467 L(loop_ashr_3): 468 add $16, %r10 469 jg L(nibble_ashr_3) 470 471 L(gobble_ashr_3): 472 movdqa (%rsi, %rcx), %xmm1 473 movdqa (%rdi, %rcx), %xmm2 474 movdqa %xmm2, %xmm4 475 476 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ 477 478 pcmpeqb %xmm1, %xmm0 479 pcmpeqb %xmm2, %xmm1 480 psubb %xmm0, %xmm1 481 pmovmskb %xmm1, %edx 482 sub $0xffff, %edx 483 jnz L(exit) 484 485 #ifdef USE_AS_STRNCMP 486 sub $16, %r11 487 jbe L(strcmp_exitz) 488 #endif 489 490 add $16, %rcx 491 movdqa %xmm4, %xmm3 492 493 add $16, %r10 494 jg L(nibble_ashr_3) /* cross page boundary */ 495 496 movdqa (%rsi, %rcx), %xmm1 497 movdqa (%rdi, %rcx), %xmm2 498 movdqa %xmm2, %xmm4 499 500 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ 501 502 pcmpeqb %xmm1, %xmm0 503 pcmpeqb %xmm2, %xmm1 504 psubb %xmm0, %xmm1 505 pmovmskb %xmm1, %edx 506 sub $0xffff, %edx 507 jnz L(exit) 508 509 #ifdef USE_AS_STRNCMP 510 sub $16, %r11 511 jbe L(strcmp_exitz) 512 #endif 513 514 add $16, %rcx 515 movdqa %xmm4, %xmm3 516 jmp L(loop_ashr_3) 517 518 .p2align 4 519 L(nibble_ashr_3): 520 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 521 pmovmskb %xmm0, %edx 522 test $0xfff8, %edx 523 jnz L(ashr_3_exittail) 524 525 #ifdef USE_AS_STRNCMP 526 cmp $12, %r11 527 jbe L(ashr_3_exittail) 528 #endif 529 530 pxor %xmm0, %xmm0 531 sub $0x1000, %r10 532 jmp 
L(gobble_ashr_3) 533 534 .p2align 4 535 L(ashr_3_exittail): 536 movdqa (%rsi, %rcx), %xmm1 537 psrldq $3, %xmm0 538 psrldq $3, %xmm3 539 jmp L(aftertail) 540 541 /* 542 * The following cases will be handled by ashr_4 543 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 544 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 545 */ 546 .p2align 4 547 L(ashr_4): 548 pxor %xmm0, %xmm0 549 movdqa (%rdi), %xmm2 550 movdqa (%rsi), %xmm1 551 pcmpeqb %xmm1, %xmm0 552 pslldq $12, %xmm2 553 pcmpeqb %xmm1, %xmm2 554 psubb %xmm0, %xmm2 555 pmovmskb %xmm2, %r9d 556 shr %cl, %edx 557 shr %cl, %r9d 558 sub %r9d, %edx 559 jnz L(less32bytes) 560 movdqa (%rdi), %xmm3 561 562 UPDATE_STRNCMP_COUNTER 563 564 pxor %xmm0, %xmm0 565 mov $16, %rcx /* index for loads */ 566 mov $4, %r9d /* byte position left over from less32bytes case */ 567 /* 568 * Setup %r10 value allows us to detect crossing a page boundary. 569 * When %r10 goes positive we have crossed a page boundary and 570 * need to do a nibble. 
571 */ 572 lea 4(%rdi), %r10 573 and $0xfff, %r10 /* offset into 4K page */ 574 sub $0x1000, %r10 /* subtract 4K pagesize */ 575 576 .p2align 4 577 L(loop_ashr_4): 578 add $16, %r10 579 jg L(nibble_ashr_4) 580 581 L(gobble_ashr_4): 582 movdqa (%rsi, %rcx), %xmm1 583 movdqa (%rdi, %rcx), %xmm2 584 movdqa %xmm2, %xmm4 585 586 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ 587 588 pcmpeqb %xmm1, %xmm0 589 pcmpeqb %xmm2, %xmm1 590 psubb %xmm0, %xmm1 591 pmovmskb %xmm1, %edx 592 sub $0xffff, %edx 593 jnz L(exit) 594 595 #ifdef USE_AS_STRNCMP 596 sub $16, %r11 597 jbe L(strcmp_exitz) 598 #endif 599 600 add $16, %rcx 601 movdqa %xmm4, %xmm3 602 603 add $16, %r10 604 jg L(nibble_ashr_4) /* cross page boundary */ 605 606 movdqa (%rsi, %rcx), %xmm1 607 movdqa (%rdi, %rcx), %xmm2 608 movdqa %xmm2, %xmm4 609 610 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ 611 612 pcmpeqb %xmm1, %xmm0 613 pcmpeqb %xmm2, %xmm1 614 psubb %xmm0, %xmm1 615 pmovmskb %xmm1, %edx 616 sub $0xffff, %edx 617 jnz L(exit) 618 619 #ifdef USE_AS_STRNCMP 620 sub $16, %r11 621 jbe L(strcmp_exitz) 622 #endif 623 624 add $16, %rcx 625 movdqa %xmm4, %xmm3 626 jmp L(loop_ashr_4) 627 628 .p2align 4 629 L(nibble_ashr_4): 630 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 631 pmovmskb %xmm0, %edx 632 test $0xfff0, %edx 633 jnz L(ashr_4_exittail) 634 635 #ifdef USE_AS_STRNCMP 636 cmp $11, %r11 637 jbe L(ashr_4_exittail) 638 #endif 639 640 pxor %xmm0, %xmm0 641 sub $0x1000, %r10 642 jmp L(gobble_ashr_4) 643 644 .p2align 4 645 L(ashr_4_exittail): 646 movdqa (%rsi, %rcx), %xmm1 647 psrldq $4, %xmm0 648 psrldq $4, %xmm3 649 jmp L(aftertail) 650 651 /* 652 * The following cases will be handled by ashr_5 653 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 654 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 655 */ 656 .p2align 4 657 L(ashr_5): 658 pxor %xmm0, %xmm0 659 movdqa (%rdi), %xmm2 660 movdqa (%rsi), %xmm1 661 pcmpeqb %xmm1, %xmm0 662 pslldq $11, %xmm2 663 
pcmpeqb %xmm1, %xmm2 664 psubb %xmm0, %xmm2 665 pmovmskb %xmm2, %r9d 666 shr %cl, %edx 667 shr %cl, %r9d 668 sub %r9d, %edx 669 jnz L(less32bytes) 670 movdqa (%rdi), %xmm3 671 672 UPDATE_STRNCMP_COUNTER 673 674 pxor %xmm0, %xmm0 675 mov $16, %rcx /* index for loads */ 676 mov $5, %r9d /* byte position left over from less32bytes case */ 677 /* 678 * Setup %r10 value allows us to detect crossing a page boundary. 679 * When %r10 goes positive we have crossed a page boundary and 680 * need to do a nibble. 681 */ 682 lea 5(%rdi), %r10 683 and $0xfff, %r10 /* offset into 4K page */ 684 sub $0x1000, %r10 /* subtract 4K pagesize */ 685 686 .p2align 4 687 L(loop_ashr_5): 688 add $16, %r10 689 jg L(nibble_ashr_5) 690 691 L(gobble_ashr_5): 692 movdqa (%rsi, %rcx), %xmm1 693 movdqa (%rdi, %rcx), %xmm2 694 movdqa %xmm2, %xmm4 695 696 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ 697 698 pcmpeqb %xmm1, %xmm0 699 pcmpeqb %xmm2, %xmm1 700 psubb %xmm0, %xmm1 701 pmovmskb %xmm1, %edx 702 sub $0xffff, %edx 703 jnz L(exit) 704 705 #ifdef USE_AS_STRNCMP 706 sub $16, %r11 707 jbe L(strcmp_exitz) 708 #endif 709 710 add $16, %rcx 711 movdqa %xmm4, %xmm3 712 713 add $16, %r10 714 jg L(nibble_ashr_5) /* cross page boundary */ 715 716 movdqa (%rsi, %rcx), %xmm1 717 movdqa (%rdi, %rcx), %xmm2 718 movdqa %xmm2, %xmm4 719 720 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ 721 722 pcmpeqb %xmm1, %xmm0 723 pcmpeqb %xmm2, %xmm1 724 psubb %xmm0, %xmm1 725 pmovmskb %xmm1, %edx 726 sub $0xffff, %edx 727 jnz L(exit) 728 729 #ifdef USE_AS_STRNCMP 730 sub $16, %r11 731 jbe L(strcmp_exitz) 732 #endif 733 734 add $16, %rcx 735 movdqa %xmm4, %xmm3 736 jmp L(loop_ashr_5) 737 738 .p2align 4 739 L(nibble_ashr_5): 740 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 741 pmovmskb %xmm0, %edx 742 test $0xffe0, %edx 743 jnz L(ashr_5_exittail) 744 745 #ifdef USE_AS_STRNCMP 746 cmp $10, %r11 747 jbe L(ashr_5_exittail) 748 #endif 749 750 pxor %xmm0, %xmm0 751 sub $0x1000, %r10 752 jmp 
L(gobble_ashr_5) 753 754 .p2align 4 755 L(ashr_5_exittail): 756 movdqa (%rsi, %rcx), %xmm1 757 psrldq $5, %xmm0 758 psrldq $5, %xmm3 759 jmp L(aftertail) 760 761 /* 762 * The following cases will be handled by ashr_6 763 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 764 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 765 */ 766 .p2align 4 767 L(ashr_6): 768 pxor %xmm0, %xmm0 769 movdqa (%rdi), %xmm2 770 movdqa (%rsi), %xmm1 771 pcmpeqb %xmm1, %xmm0 772 pslldq $10, %xmm2 773 pcmpeqb %xmm1, %xmm2 774 psubb %xmm0, %xmm2 775 pmovmskb %xmm2, %r9d 776 shr %cl, %edx 777 shr %cl, %r9d 778 sub %r9d, %edx 779 jnz L(less32bytes) 780 movdqa (%rdi), %xmm3 781 782 UPDATE_STRNCMP_COUNTER 783 784 pxor %xmm0, %xmm0 785 mov $16, %rcx /* index for loads */ 786 mov $6, %r9d /* byte position left over from less32bytes case */ 787 /* 788 * Setup %r10 value allows us to detect crossing a page boundary. 789 * When %r10 goes positive we have crossed a page boundary and 790 * need to do a nibble. 
791 */ 792 lea 6(%rdi), %r10 793 and $0xfff, %r10 /* offset into 4K page */ 794 sub $0x1000, %r10 /* subtract 4K pagesize */ 795 796 .p2align 4 797 L(loop_ashr_6): 798 add $16, %r10 799 jg L(nibble_ashr_6) 800 801 L(gobble_ashr_6): 802 movdqa (%rsi, %rcx), %xmm1 803 movdqa (%rdi, %rcx), %xmm2 804 movdqa %xmm2, %xmm4 805 806 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ 807 808 pcmpeqb %xmm1, %xmm0 809 pcmpeqb %xmm2, %xmm1 810 psubb %xmm0, %xmm1 811 pmovmskb %xmm1, %edx 812 sub $0xffff, %edx 813 jnz L(exit) 814 815 #ifdef USE_AS_STRNCMP 816 sub $16, %r11 817 jbe L(strcmp_exitz) 818 #endif 819 820 add $16, %rcx 821 movdqa %xmm4, %xmm3 822 823 add $16, %r10 824 jg L(nibble_ashr_6) /* cross page boundary */ 825 826 movdqa (%rsi, %rcx), %xmm1 827 movdqa (%rdi, %rcx), %xmm2 828 movdqa %xmm2, %xmm4 829 830 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ 831 832 pcmpeqb %xmm1, %xmm0 833 pcmpeqb %xmm2, %xmm1 834 psubb %xmm0, %xmm1 835 pmovmskb %xmm1, %edx 836 sub $0xffff, %edx 837 jnz L(exit) 838 839 #ifdef USE_AS_STRNCMP 840 sub $16, %r11 841 jbe L(strcmp_exitz) 842 #endif 843 844 add $16, %rcx 845 movdqa %xmm4, %xmm3 846 jmp L(loop_ashr_6) 847 848 .p2align 4 849 L(nibble_ashr_6): 850 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 851 pmovmskb %xmm0, %edx 852 test $0xffc0, %edx 853 jnz L(ashr_6_exittail) 854 855 #ifdef USE_AS_STRNCMP 856 cmp $9, %r11 857 jbe L(ashr_6_exittail) 858 #endif 859 860 pxor %xmm0, %xmm0 861 sub $0x1000, %r10 862 jmp L(gobble_ashr_6) 863 864 .p2align 4 865 L(ashr_6_exittail): 866 movdqa (%rsi, %rcx), %xmm1 867 psrldq $6, %xmm0 868 psrldq $6, %xmm3 869 jmp L(aftertail) 870 871 /* 872 * The following cases will be handled by ashr_7 873 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 874 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 875 */ 876 .p2align 4 877 L(ashr_7): 878 pxor %xmm0, %xmm0 879 movdqa (%rdi), %xmm2 880 movdqa (%rsi), %xmm1 881 pcmpeqb %xmm1, %xmm0 882 pslldq $9, %xmm2 883 pcmpeqb 
%xmm1, %xmm2 884 psubb %xmm0, %xmm2 885 pmovmskb %xmm2, %r9d 886 shr %cl, %edx 887 shr %cl, %r9d 888 sub %r9d, %edx 889 jnz L(less32bytes) 890 movdqa (%rdi), %xmm3 891 892 UPDATE_STRNCMP_COUNTER 893 894 pxor %xmm0, %xmm0 895 mov $16, %rcx /* index for loads */ 896 mov $7, %r9d /* byte position left over from less32bytes case */ 897 /* 898 * Setup %r10 value allows us to detect crossing a page boundary. 899 * When %r10 goes positive we have crossed a page boundary and 900 * need to do a nibble. 901 */ 902 lea 7(%rdi), %r10 903 and $0xfff, %r10 /* offset into 4K page */ 904 sub $0x1000, %r10 /* subtract 4K pagesize */ 905 906 .p2align 4 907 L(loop_ashr_7): 908 add $16, %r10 909 jg L(nibble_ashr_7) 910 911 L(gobble_ashr_7): 912 movdqa (%rsi, %rcx), %xmm1 913 movdqa (%rdi, %rcx), %xmm2 914 movdqa %xmm2, %xmm4 915 916 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ 917 918 pcmpeqb %xmm1, %xmm0 919 pcmpeqb %xmm2, %xmm1 920 psubb %xmm0, %xmm1 921 pmovmskb %xmm1, %edx 922 sub $0xffff, %edx 923 jnz L(exit) 924 925 #ifdef USE_AS_STRNCMP 926 sub $16, %r11 927 jbe L(strcmp_exitz) 928 #endif 929 930 add $16, %rcx 931 movdqa %xmm4, %xmm3 932 933 add $16, %r10 934 jg L(nibble_ashr_7) /* cross page boundary */ 935 936 movdqa (%rsi, %rcx), %xmm1 937 movdqa (%rdi, %rcx), %xmm2 938 movdqa %xmm2, %xmm4 939 940 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ 941 942 pcmpeqb %xmm1, %xmm0 943 pcmpeqb %xmm2, %xmm1 944 psubb %xmm0, %xmm1 945 pmovmskb %xmm1, %edx 946 sub $0xffff, %edx 947 jnz L(exit) 948 949 #ifdef USE_AS_STRNCMP 950 sub $16, %r11 951 jbe L(strcmp_exitz) 952 #endif 953 954 add $16, %rcx 955 movdqa %xmm4, %xmm3 956 jmp L(loop_ashr_7) 957 958 .p2align 4 959 L(nibble_ashr_7): 960 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 961 pmovmskb %xmm0, %edx 962 test $0xff80, %edx 963 jnz L(ashr_7_exittail) 964 965 #ifdef USE_AS_STRNCMP 966 cmp $8, %r11 967 jbe L(ashr_7_exittail) 968 #endif 969 970 pxor %xmm0, %xmm0 971 sub $0x1000, %r10 972 jmp 
L(gobble_ashr_7) 973 974 .p2align 4 975 L(ashr_7_exittail): 976 movdqa (%rsi, %rcx), %xmm1 977 psrldq $7, %xmm0 978 psrldq $7, %xmm3 979 jmp L(aftertail) 980 981 /* 982 * The following cases will be handled by ashr_8 983 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 984 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 985 */ 986 .p2align 4 987 L(ashr_8): 988 pxor %xmm0, %xmm0 989 movdqa (%rdi), %xmm2 990 movdqa (%rsi), %xmm1 991 pcmpeqb %xmm1, %xmm0 992 pslldq $8, %xmm2 993 pcmpeqb %xmm1, %xmm2 994 psubb %xmm0, %xmm2 995 pmovmskb %xmm2, %r9d 996 shr %cl, %edx 997 shr %cl, %r9d 998 sub %r9d, %edx 999 jnz L(less32bytes) 1000 movdqa (%rdi), %xmm3 1001 1002 UPDATE_STRNCMP_COUNTER 1003 1004 pxor %xmm0, %xmm0 1005 mov $16, %rcx /* index for loads */ 1006 mov $8, %r9d /* byte position left over from less32bytes case */ 1007 /* 1008 * Setup %r10 value allows us to detect crossing a page boundary. 1009 * When %r10 goes positive we have crossed a page boundary and 1010 * need to do a nibble. 
1011 */ 1012 lea 8(%rdi), %r10 1013 and $0xfff, %r10 /* offset into 4K page */ 1014 sub $0x1000, %r10 /* subtract 4K pagesize */ 1015 1016 .p2align 4 1017 L(loop_ashr_8): 1018 add $16, %r10 1019 jg L(nibble_ashr_8) 1020 1021 L(gobble_ashr_8): 1022 movdqa (%rsi, %rcx), %xmm1 1023 movdqa (%rdi, %rcx), %xmm2 1024 movdqa %xmm2, %xmm4 1025 1026 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ 1027 1028 pcmpeqb %xmm1, %xmm0 1029 pcmpeqb %xmm2, %xmm1 1030 psubb %xmm0, %xmm1 1031 pmovmskb %xmm1, %edx 1032 sub $0xffff, %edx 1033 jnz L(exit) 1034 1035 #ifdef USE_AS_STRNCMP 1036 sub $16, %r11 1037 jbe L(strcmp_exitz) 1038 #endif 1039 1040 add $16, %rcx 1041 movdqa %xmm4, %xmm3 1042 1043 add $16, %r10 1044 jg L(nibble_ashr_8) /* cross page boundary */ 1045 1046 movdqa (%rsi, %rcx), %xmm1 1047 movdqa (%rdi, %rcx), %xmm2 1048 movdqa %xmm2, %xmm4 1049 1050 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ 1051 1052 pcmpeqb %xmm1, %xmm0 1053 pcmpeqb %xmm2, %xmm1 1054 psubb %xmm0, %xmm1 1055 pmovmskb %xmm1, %edx 1056 sub $0xffff, %edx 1057 jnz L(exit) 1058 1059 #ifdef USE_AS_STRNCMP 1060 sub $16, %r11 1061 jbe L(strcmp_exitz) 1062 #endif 1063 1064 add $16, %rcx 1065 movdqa %xmm4, %xmm3 1066 jmp L(loop_ashr_8) 1067 1068 .p2align 4 1069 L(nibble_ashr_8): 1070 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1071 pmovmskb %xmm0, %edx 1072 test $0xff00, %edx 1073 jnz L(ashr_8_exittail) 1074 1075 #ifdef USE_AS_STRNCMP 1076 cmp $7, %r11 1077 jbe L(ashr_8_exittail) 1078 #endif 1079 1080 pxor %xmm0, %xmm0 1081 sub $0x1000, %r10 1082 jmp L(gobble_ashr_8) 1083 1084 .p2align 4 1085 L(ashr_8_exittail): 1086 movdqa (%rsi, %rcx), %xmm1 1087 psrldq $8, %xmm0 1088 psrldq $8, %xmm3 1089 jmp L(aftertail) 1090 1091 /* 1092 * The following cases will be handled by ashr_9 1093 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1094 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 1095 */ 1096 .p2align 4 1097 L(ashr_9): 1098 pxor %xmm0, %xmm0 1099 movdqa (%rdi), 
%xmm2 1100 movdqa (%rsi), %xmm1 1101 pcmpeqb %xmm1, %xmm0 1102 pslldq $7, %xmm2 1103 pcmpeqb %xmm1, %xmm2 1104 psubb %xmm0, %xmm2 1105 pmovmskb %xmm2, %r9d 1106 shr %cl, %edx 1107 shr %cl, %r9d 1108 sub %r9d, %edx 1109 jnz L(less32bytes) 1110 movdqa (%rdi), %xmm3 1111 1112 UPDATE_STRNCMP_COUNTER 1113 1114 pxor %xmm0, %xmm0 1115 mov $16, %rcx /* index for loads */ 1116 mov $9, %r9d /* byte position left over from less32bytes case */ 1117 /* 1118 * Setup %r10 value allows us to detect crossing a page boundary. 1119 * When %r10 goes positive we have crossed a page boundary and 1120 * need to do a nibble. 1121 */ 1122 lea 9(%rdi), %r10 1123 and $0xfff, %r10 /* offset into 4K page */ 1124 sub $0x1000, %r10 /* subtract 4K pagesize */ 1125 1126 .p2align 4 1127 L(loop_ashr_9): 1128 add $16, %r10 1129 jg L(nibble_ashr_9) 1130 1131 L(gobble_ashr_9): 1132 movdqa (%rsi, %rcx), %xmm1 1133 movdqa (%rdi, %rcx), %xmm2 1134 movdqa %xmm2, %xmm4 1135 1136 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 1137 1138 pcmpeqb %xmm1, %xmm0 1139 pcmpeqb %xmm2, %xmm1 1140 psubb %xmm0, %xmm1 1141 pmovmskb %xmm1, %edx 1142 sub $0xffff, %edx 1143 jnz L(exit) 1144 1145 #ifdef USE_AS_STRNCMP 1146 sub $16, %r11 1147 jbe L(strcmp_exitz) 1148 #endif 1149 1150 add $16, %rcx 1151 movdqa %xmm4, %xmm3 1152 1153 add $16, %r10 1154 jg L(nibble_ashr_9) /* cross page boundary */ 1155 1156 movdqa (%rsi, %rcx), %xmm1 1157 movdqa (%rdi, %rcx), %xmm2 1158 movdqa %xmm2, %xmm4 1159 1160 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 1161 1162 pcmpeqb %xmm1, %xmm0 1163 pcmpeqb %xmm2, %xmm1 1164 psubb %xmm0, %xmm1 1165 pmovmskb %xmm1, %edx 1166 sub $0xffff, %edx 1167 jnz L(exit) 1168 1169 #ifdef USE_AS_STRNCMP 1170 sub $16, %r11 1171 jbe L(strcmp_exitz) 1172 #endif 1173 1174 add $16, %rcx 1175 movdqa %xmm4, %xmm3 /* store for next cycle */ 1176 jmp L(loop_ashr_9) 1177 1178 .p2align 4 1179 L(nibble_ashr_9): 1180 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1181 pmovmskb %xmm0, %edx 
1182 test $0xfe00, %edx 1183 jnz L(ashr_9_exittail) 1184 1185 #ifdef USE_AS_STRNCMP 1186 cmp $6, %r11 1187 jbe L(ashr_9_exittail) 1188 #endif 1189 1190 pxor %xmm0, %xmm0 1191 sub $0x1000, %r10 1192 jmp L(gobble_ashr_9) 1193 1194 .p2align 4 1195 L(ashr_9_exittail): 1196 movdqa (%rsi, %rcx), %xmm1 1197 psrldq $9, %xmm0 1198 psrldq $9, %xmm3 1199 jmp L(aftertail) 1200 1201 /* 1202 * The following cases will be handled by ashr_10 1203 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1204 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 1205 */ 1206 .p2align 4 1207 L(ashr_10): 1208 pxor %xmm0, %xmm0 1209 movdqa (%rdi), %xmm2 1210 movdqa (%rsi), %xmm1 1211 pcmpeqb %xmm1, %xmm0 1212 pslldq $6, %xmm2 1213 pcmpeqb %xmm1, %xmm2 1214 psubb %xmm0, %xmm2 1215 pmovmskb %xmm2, %r9d 1216 shr %cl, %edx 1217 shr %cl, %r9d 1218 sub %r9d, %edx 1219 jnz L(less32bytes) 1220 movdqa (%rdi), %xmm3 1221 1222 UPDATE_STRNCMP_COUNTER 1223 1224 pxor %xmm0, %xmm0 1225 mov $16, %rcx /* index for loads */ 1226 mov $10, %r9d /* byte position left over from less32bytes case */ 1227 /* 1228 * Setup %r10 value allows us to detect crossing a page boundary. 1229 * When %r10 goes positive we have crossed a page boundary and 1230 * need to do a nibble. 
1231 */ 1232 lea 10(%rdi), %r10 1233 and $0xfff, %r10 /* offset into 4K page */ 1234 sub $0x1000, %r10 /* subtract 4K pagesize */ 1235 1236 .p2align 4 1237 L(loop_ashr_10): 1238 add $16, %r10 1239 jg L(nibble_ashr_10) 1240 1241 L(gobble_ashr_10): 1242 movdqa (%rsi, %rcx), %xmm1 1243 movdqa (%rdi, %rcx), %xmm2 1244 movdqa %xmm2, %xmm4 1245 1246 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ 1247 1248 pcmpeqb %xmm1, %xmm0 1249 pcmpeqb %xmm2, %xmm1 1250 psubb %xmm0, %xmm1 1251 pmovmskb %xmm1, %edx 1252 sub $0xffff, %edx 1253 jnz L(exit) 1254 1255 #ifdef USE_AS_STRNCMP 1256 sub $16, %r11 1257 jbe L(strcmp_exitz) 1258 #endif 1259 1260 add $16, %rcx 1261 movdqa %xmm4, %xmm3 1262 1263 add $16, %r10 1264 jg L(nibble_ashr_10) /* cross page boundary */ 1265 1266 movdqa (%rsi, %rcx), %xmm1 1267 movdqa (%rdi, %rcx), %xmm2 1268 movdqa %xmm2, %xmm4 1269 1270 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ 1271 1272 pcmpeqb %xmm1, %xmm0 1273 pcmpeqb %xmm2, %xmm1 1274 psubb %xmm0, %xmm1 1275 pmovmskb %xmm1, %edx 1276 sub $0xffff, %edx 1277 jnz L(exit) 1278 1279 #ifdef USE_AS_STRNCMP 1280 sub $16, %r11 1281 jbe L(strcmp_exitz) 1282 #endif 1283 1284 add $16, %rcx 1285 movdqa %xmm4, %xmm3 1286 jmp L(loop_ashr_10) 1287 1288 .p2align 4 1289 L(nibble_ashr_10): 1290 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1291 pmovmskb %xmm0, %edx 1292 test $0xfc00, %edx 1293 jnz L(ashr_10_exittail) 1294 1295 #ifdef USE_AS_STRNCMP 1296 cmp $5, %r11 1297 jbe L(ashr_10_exittail) 1298 #endif 1299 1300 pxor %xmm0, %xmm0 1301 sub $0x1000, %r10 1302 jmp L(gobble_ashr_10) 1303 1304 .p2align 4 1305 L(ashr_10_exittail): 1306 movdqa (%rsi, %rcx), %xmm1 1307 psrldq $10, %xmm0 1308 psrldq $10, %xmm3 1309 jmp L(aftertail) 1310 1311 /* 1312 * The following cases will be handled by ashr_11 1313 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1314 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 1315 */ 1316 .p2align 4 1317 L(ashr_11): 1318 pxor %xmm0, %xmm0 
/* ashr_11, continued: %xmm0 was cleared by the pxor just above.  */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$5, %xmm2		/* line (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes match */
	psubb	%xmm0, %xmm2		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx is loaded outside this chunk, presumably 0xffff -- confirm */
	shr	%cl, %r9d		/* discard bytes below the %rsi offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* difference or NUL in the first partial block */
	movdqa	(%rdi), %xmm3		/* bytes 11..15 of this block are still uncompared */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* byte position left over from less32bytes case */
	/*
	 * %r10 = ((%rdi + 11) & 0xfff) - 0x1000.  It grows by 16 per block;
	 * when it goes positive the next (%rdi, %rcx) load would cross into
	 * a new 4K page, so the nibble check runs first.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)

L(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	palignr $11, %xmm3, %xmm2	/* %xmm3 bytes 11..15 followed by %xmm2 bytes 0..10 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the (%rsi) block */
	pcmpeqb	%xmm2, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes match and none is NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* limit reached with no difference */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	/* Second unrolled copy; %xmm0 is still all-zero because the block
	   just compared contained no NUL.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	.p2align 4
L(nibble_ashr_11):
	/*
	 * The next (%rdi, %rcx) load would start a new page.  Only bytes
	 * 11..15 of the saved block in %xmm3 are still uncompared; if one
	 * of them is NUL the comparison ends here and the next page must
	 * not be read.
	 */
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* mask bits 11..15 */
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * %r11 is the number of bytes left to compare, starting with those
	 * 5 leftover bytes.  When the limit ends within them (<= 5, not
	 * just < 5) the result is decided by %xmm3 alone, so take the exit
	 * tail instead of loading from the next page of %rdi.
	 */
	cmp	$5, %r11
	jbe	L(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the page counter for the new page */
	jmp	L(gobble_ashr_11)

	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0		/* move the leftover NUL mask down to byte 0 */
	psrldq	$11, %xmm3		/* move the leftover bytes down to byte 0 */
	jmp	L(aftertail)

/*
 * ashr_12 handles the case where %rsi sits 4 bytes further into its
 * 16-byte block than %rdi does:
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset (15 + rax - rcx)
 *   n (4~15)              n - 4                 11
 */
	.p2align 4
L(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$4, %xmm2		/* line (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes match */
	psubb	%xmm0, %xmm2		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx is loaded outside this chunk, presumably 0xffff -- confirm */
	shr	%cl, %r9d		/* discard bytes below the %rsi offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* difference or NUL in the first partial block */
	movdqa	(%rdi), %xmm3		/* bytes 12..15 of this block are still uncompared */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$12, %r9d		/* byte position left over from less32bytes case */
	/*
	 * %r10 = ((%rdi + 12) & 0xfff) - 0x1000.  It grows by 16 per block;
	 * when it goes positive the next (%rdi, %rcx) load would cross into
	 * a new 4K page, so the nibble check runs first.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)

L(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	palignr $12, %xmm3, %xmm2	/* %xmm3 bytes 12..15 followed by %xmm2 bytes 0..11 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the (%rsi) block */
	pcmpeqb	%xmm2, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes match and none is NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* limit reached with no difference */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	/* Second unrolled copy; %xmm0 is still all-zero because the block
	   just compared contained no NUL.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	.p2align 4
L(nibble_ashr_12):
	/*
	 * The next (%rdi, %rcx) load would start a new page.  Only bytes
	 * 12..15 of the saved block in %xmm3 are still uncompared; if one
	 * of them is NUL the comparison ends here and the next page must
	 * not be read.
	 */
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* mask bits 12..15 */
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * %r11 is the number of bytes left to compare, starting with those
	 * 4 leftover bytes.  When the limit ends within them (<= 4, not
	 * just < 4) the result is decided by %xmm3 alone, so take the exit
	 * tail instead of loading from the next page of %rdi.
	 */
	cmp	$4, %r11
	jbe	L(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the page counter for the new page */
	jmp	L(gobble_ashr_12)

	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0		/* move the leftover NUL mask down to byte 0 */
	psrldq	$12, %xmm3		/* move the leftover bytes down to byte 0 */
	jmp	L(aftertail)

/*
 * ashr_13 handles the case where %rsi sits 3 bytes further into its
 * 16-byte block than %rdi does:
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset (15 + rax - rcx)
 *   n (3~15)              n - 3                 12
 */
	.p2align 4
L(ashr_13):
	pxor	%xmm0, %xmm0
/* ashr_13, continued: %xmm0 was cleared by the pxor just above.  */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$3, %xmm2		/* line (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes match */
	psubb	%xmm0, %xmm2		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx is loaded outside this chunk, presumably 0xffff -- confirm */
	shr	%cl, %r9d		/* discard bytes below the %rsi offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* difference or NUL in the first partial block */
	movdqa	(%rdi), %xmm3		/* bytes 13..15 of this block are still uncompared */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$13, %r9d		/* byte position left over from less32bytes case */
	/*
	 * %r10 = ((%rdi + 13) & 0xfff) - 0x1000.  It grows by 16 per block;
	 * when it goes positive the next (%rdi, %rcx) load would cross into
	 * a new 4K page, so the nibble check runs first.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)

L(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	palignr $13, %xmm3, %xmm2	/* %xmm3 bytes 13..15 followed by %xmm2 bytes 0..12 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the (%rsi) block */
	pcmpeqb	%xmm2, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes match and none is NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* limit reached with no difference */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	/* Second unrolled copy; %xmm0 is still all-zero because the block
	   just compared contained no NUL.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	.p2align 4
L(nibble_ashr_13):
	/*
	 * The next (%rdi, %rcx) load would start a new page.  Only bytes
	 * 13..15 of the saved block in %xmm3 are still uncompared; if one
	 * of them is NUL the comparison ends here and the next page must
	 * not be read.
	 */
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* mask bits 13..15 */
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * %r11 is the number of bytes left to compare, starting with those
	 * 3 leftover bytes.  When the limit ends within them (<= 3, not
	 * just < 3) the result is decided by %xmm3 alone, so take the exit
	 * tail instead of loading from the next page of %rdi.
	 */
	cmp	$3, %r11
	jbe	L(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the page counter for the new page */
	jmp	L(gobble_ashr_13)

	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0		/* move the leftover NUL mask down to byte 0 */
	psrldq	$13, %xmm3		/* move the leftover bytes down to byte 0 */
	jmp	L(aftertail)

/*
 * ashr_14 handles the case where %rsi sits 2 bytes further into its
 * 16-byte block than %rdi does:
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset (15 + rax - rcx)
 *   n (2~15)              n - 2                 13
 */
	.p2align 4
L(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$2, %xmm2		/* line (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes match */
	psubb	%xmm0, %xmm2		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx is loaded outside this chunk, presumably 0xffff -- confirm */
	shr	%cl, %r9d		/* discard bytes below the %rsi offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* difference or NUL in the first partial block */
	movdqa	(%rdi), %xmm3		/* bytes 14..15 of this block are still uncompared */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$14, %r9d		/* byte position left over from less32bytes case */
	/*
	 * %r10 = ((%rdi + 14) & 0xfff) - 0x1000.  It grows by 16 per block;
	 * when it goes positive the next (%rdi, %rcx) load would cross into
	 * a new 4K page, so the nibble check runs first.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_14):
	add	$16, %r10
	jg	L(nibble_ashr_14)

L(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	palignr $14, %xmm3, %xmm2	/* %xmm3 bytes 14..15 followed by %xmm2 bytes 0..13 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the (%rsi) block */
	pcmpeqb	%xmm2, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes match and none is NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* limit reached with no difference */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

	/* Second unrolled copy; %xmm0 is still all-zero because the block
	   just compared contained no NUL.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_14)

	.p2align 4
L(nibble_ashr_14):
	/*
	 * The next (%rdi, %rcx) load would start a new page.  Only bytes
	 * 14..15 of the saved block in %xmm3 are still uncompared; if one
	 * of them is NUL the comparison ends here and the next page must
	 * not be read.
	 */
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx		/* mask bits 14..15 */
	jnz	L(ashr_14_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * %r11 is the number of bytes left to compare, starting with those
	 * 2 leftover bytes.  When the limit ends within them (<= 2, not
	 * just < 2) the result is decided by %xmm3 alone, so take the exit
	 * tail instead of loading from the next page of %rdi.
	 */
	cmp	$2, %r11
	jbe	L(ashr_14_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the page counter for the new page */
	jmp	L(gobble_ashr_14)

	.p2align 4
L(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0		/* move the leftover NUL mask down to byte 0 */
	psrldq	$14, %xmm3		/* move the leftover bytes down to byte 0 */
	jmp	L(aftertail)

/*
 * ashr_15 handles the case where %rsi sits 1 byte further into its
 * 16-byte block than %rdi does:
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset (15 + rax - rcx)
 *   n (1~15)              n - 1                 14
 */
	.p2align 4
L(ashr_15):
	pxor	%xmm0, %xmm0
/* ashr_15, continued: %xmm0 was cleared by the pxor just above.  */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$1, %xmm2		/* line (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes match */
	psubb	%xmm0, %xmm2		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx is loaded outside this chunk, presumably 0xffff -- confirm */
	shr	%cl, %r9d		/* discard bytes below the %rsi offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* difference or NUL in the first partial block */

	movdqa	(%rdi), %xmm3		/* byte 15 of this block is still uncompared */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$15, %r9d		/* byte position left over from less32bytes case */
	/*
	 * %r10 = ((%rdi + 15) & 0xfff) - 0x1000.  It grows by 16 per block;
	 * when it goes positive the next (%rdi, %rcx) load would cross into
	 * a new 4K page, so the nibble check runs first.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */

	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_15):
	add	$16, %r10
	jg	L(nibble_ashr_15)

L(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw (%rdi) block for the next round */

	palignr $15, %xmm3, %xmm2	/* %xmm3 byte 15 followed by %xmm2 bytes 0..14 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the (%rsi) block */
	pcmpeqb	%xmm2, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes match and none is NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* limit reached with no difference */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

	/* Second unrolled copy; %xmm0 is still all-zero because the block
	   just compared contained no NUL.  */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr $15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_15)

	.p2align 4
L(nibble_ashr_15):
	/*
	 * The next (%rdi, %rcx) load would start a new page.  Only byte 15
	 * of the saved block in %xmm3 is still uncompared; if it is NUL the
	 * comparison ends here and the next page must not be read.
	 */
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx		/* mask bit 15 */
	jnz	L(ashr_15_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * %r11 is the number of bytes left to compare, starting with that
	 * one leftover byte.  When it is the last byte within the limit
	 * (%r11 <= 1) the result is decided by %xmm3 alone, so take the
	 * exit tail instead of loading from the next page of %rdi.  A test
	 * for %r11 == 0 is not enough: both the counter macro and the
	 * loop leave for strcmp_exitz on zero, so %r11 is non-zero here.
	 */
	cmp	$1, %r11
	jbe	L(ashr_15_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* rebase the page counter for the new page */
	jmp	L(gobble_ashr_15)

	.p2align 4
L(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3		/* move the leftover byte down to byte 0 */
	psrldq	$15, %xmm0		/* move the leftover NUL mask down to byte 0 */
	/* falls through into aftertail */

/*
 * Shared exit-tail code: compare the leftover bytes (already shifted
 * down to byte 0 of %xmm3, NUL mask in %xmm0) with the current (%rsi)
 * block in %xmm1.
 */
	.p2align 4
L(aftertail):
	pcmpeqb	%xmm3, %xmm1		/* matching bytes */
	psubb	%xmm0, %xmm1		/* top bit survives only for matching non-NUL bytes */
	pmovmskb %xmm1, %edx
	not	%edx			/* set bits now mark a difference or a NUL */

/*
 * Common exit.  The lowest set bit of %edx is the index of the first
 * byte that differs or is NUL, counted from (%rsi, %rcx).
 */
	.p2align 4
L(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
L(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	L(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
L(ret):
L(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11
	jbe	L(strcmp_exitz)		/* that byte lies at or beyond the limit: equal */
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

	sub	%ecx, %eax		/* return (%rdi) byte minus (%rsi) byte */
	ret

L(strcmp_exitz):
	xor	%eax, %eax		/* return 0: equal within the limit */
	ret

/* Reached from the entry code when the limit is exactly 1: only the
   first byte of each string is compared.  */
	.p2align 4
L(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
END (STRCMP)

/*
 * Case table: each entry is the 32-bit offset of a handler relative to
 * the start of the table.  Entries 0..14 are ashr_1..ashr_15 and entry
 * 15 is ashr_0.
 * NOTE(review): the code that indexes this table is outside this chunk;
 * per the case comments the index is presumably 15 + rax - rcx.
 */
	.section .rodata,"a",@progbits
	.p2align 3
L(unaligned_table):
	.int	L(ashr_1) - L(unaligned_table)
	.int	L(ashr_2) - L(unaligned_table)
	.int	L(ashr_3) - L(unaligned_table)
	.int	L(ashr_4) - L(unaligned_table)
	.int	L(ashr_5) - L(unaligned_table)
	.int	L(ashr_6) - L(unaligned_table)
	.int	L(ashr_7) - L(unaligned_table)
	.int	L(ashr_8) - L(unaligned_table)
	.int	L(ashr_9) - L(unaligned_table)
	.int	L(ashr_10) - L(unaligned_table)
	.int	L(ashr_11) - L(unaligned_table)
	.int	L(ashr_12) - L(unaligned_table)
	.int	L(ashr_13) - L(unaligned_table)
	.int	L(ashr_14) - L(unaligned_table)
	.int	L(ashr_15) - L(unaligned_table)
	.int	L(ashr_0) - L(unaligned_table)