1 /* 2 Copyright (c) 2010, Intel Corporation 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 11 * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 15 * Neither the name of Intel Corporation nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * SSSE3 memcmp for 32-bit x86 (AT&T syntax, preprocessed by cpp).
 *
 * The exported symbol name defaults to ssse3_memcmp3_new and can be
 * overridden by defining MEMCMP before this point.
 */
#ifndef MEMCMP
# define MEMCMP		ssse3_memcmp3_new
#endif

/* L(name) expands to a ".L"-prefixed label.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) aligns the location counter to 2^n bytes.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the assembler CFI directives; each one may be
   supplied by the including environment instead.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore (reg)
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name) opens a global, 16-byte aligned function symbol and
   starts its CFI region; END(name) closes the region and sets the
   symbol size.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* CFI bookkeeping for one 4-byte push / pop of REG.  These emit no
   instructions; they only describe the stack to the unwinder.  */
#define CFI_PUSH(REG)						\
  cfi_adjust_cfa_offset (4);					\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)						\
  cfi_adjust_cfa_offset (-4);					\
  cfi_restore (REG)

/* Real push / pop of REG together with the matching CFI update.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Argument offsets from %esp as read at function entry, before any
   register has been pushed: BLK1 = 4, BLK2 = 8, LEN = 12.  */
#define PARMS		4
#define BLK1		PARMS
#define BLK2		BLK1+4
#define LEN		BLK2+4

/* RETURN_END restores %edi, %esi, %ebx and returns.
   RETURN does the same and then re-declares the CFI state for the code
   that textually follows the ret, which is reached with all three
   registers still on the stack.  The registers are pushed in the order
   %ebx, %esi, %edi (see L(48bytesormore)), so the save slots must be
   re-declared in that same order; declaring %edi before %esi would
   describe the two slots swapped.  */
#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%esi); \
			CFI_PUSH (%edi)

	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCMP)
	/* Load the stack arguments: %ecx = LEN, %eax = BLK1, %edx = BLK2.
	   The movl between cmp and jae leaves the flags untouched.  */
	movl	LEN(%esp), %ecx
	movl	BLK1(%esp), %eax
	cmp	$48, %ecx
	movl	BLK2(%esp), %edx
	jae	L(48bytesormore)
	cmp	$1, %ecx
	jbe	L(less1bytes)
	/* 2..47 bytes: point %eax and %edx just past the end of each
	   buffer; L(less48bytes) dispatches on %ecx and its targets
	   address the data with negative offsets.  */
	PUSH (%ebx)
	add	%ecx, %edx
	add	%ecx, %eax
	jmp	L(less48bytes)

	/* The code below is reached with nothing pushed.  */
	CFI_POP (%ebx)
	ALIGN (4)
L(less1bytes):
	/* Flags are still those of "cmp $1, %ecx": below means LEN == 0.  */
	jb	L(zero)
	/* LEN == 1: compare the single byte as unsigned.  */
	movb	(%eax), %cl
	cmp	(%edx), %cl
	je	L(zero)
	/* mov does not alter the flags, so ja still tests the byte compare:
	   return 1 if the BLK1 byte is above the BLK2 byte, else -1.  */
	mov	$1, %eax
	ja	L(1bytesend)
	neg	%eax
L(1bytesend):
	ret

	ALIGN (4)
L(zero):
	/* Buffers compare equal: return 0.  */
	mov	$0, %eax
	ret

	ALIGN (4)
L(48bytesormore):
	/* First of the three register saves on the 48-bytes-or-more path.  */
	PUSH (%ebx)
PUSH (%esi) 133 PUSH (%edi) 134 movdqu (%eax), %xmm3 135 movdqu (%edx), %xmm0 136 movl %eax, %edi 137 movl %edx, %esi 138 pcmpeqb %xmm0, %xmm3 139 pmovmskb %xmm3, %edx 140 lea 16(%edi), %edi 141 142 sub $0xffff, %edx 143 lea 16(%esi), %esi 144 jnz L(less16bytes) 145 mov %edi, %edx 146 and $0xf, %edx 147 xor %edx, %edi 148 sub %edx, %esi 149 add %edx, %ecx 150 mov %esi, %edx 151 and $0xf, %edx 152 jz L(shr_0) 153 xor %edx, %esi 154 155 cmp $8, %edx 156 jae L(next_unaligned_table) 157 cmp $0, %edx 158 je L(shr_0) 159 cmp $1, %edx 160 je L(shr_1) 161 cmp $2, %edx 162 je L(shr_2) 163 cmp $3, %edx 164 je L(shr_3) 165 cmp $4, %edx 166 je L(shr_4) 167 cmp $5, %edx 168 je L(shr_5) 169 cmp $6, %edx 170 je L(shr_6) 171 jmp L(shr_7) 172 173 ALIGN (4) 174 L(next_unaligned_table): 175 cmp $8, %edx 176 je L(shr_8) 177 cmp $9, %edx 178 je L(shr_9) 179 cmp $10, %edx 180 je L(shr_10) 181 cmp $11, %edx 182 je L(shr_11) 183 cmp $12, %edx 184 je L(shr_12) 185 cmp $13, %edx 186 je L(shr_13) 187 cmp $14, %edx 188 je L(shr_14) 189 jmp L(shr_15) 190 191 ALIGN (4) 192 L(shr_0): 193 cmp $80, %ecx 194 jae L(shr_0_gobble) 195 lea -48(%ecx), %ecx 196 xor %eax, %eax 197 movaps (%esi), %xmm1 198 pcmpeqb (%edi), %xmm1 199 movaps 16(%esi), %xmm2 200 pcmpeqb 16(%edi), %xmm2 201 pand %xmm1, %xmm2 202 pmovmskb %xmm2, %edx 203 add $32, %edi 204 add $32, %esi 205 sub $0xffff, %edx 206 jnz L(exit) 207 208 lea (%ecx, %edi,1), %eax 209 lea (%ecx, %esi,1), %edx 210 POP (%edi) 211 POP (%esi) 212 jmp L(less48bytes) 213 214 CFI_PUSH (%esi) 215 CFI_PUSH (%edi) 216 ALIGN (4) 217 L(shr_0_gobble): 218 lea -48(%ecx), %ecx 219 movdqa (%esi), %xmm0 220 xor %eax, %eax 221 pcmpeqb (%edi), %xmm0 222 sub $32, %ecx 223 movdqa 16(%esi), %xmm2 224 pcmpeqb 16(%edi), %xmm2 225 L(shr_0_gobble_loop): 226 pand %xmm0, %xmm2 227 sub $32, %ecx 228 pmovmskb %xmm2, %edx 229 movdqa %xmm0, %xmm1 230 movdqa 32(%esi), %xmm0 231 movdqa 48(%esi), %xmm2 232 sbb $0xffff, %edx 233 pcmpeqb 32(%edi), %xmm0 234 pcmpeqb 48(%edi), %xmm2 235 lea 
32(%edi), %edi 236 lea 32(%esi), %esi 237 jz L(shr_0_gobble_loop) 238 239 pand %xmm0, %xmm2 240 cmp $0, %ecx 241 jge L(shr_0_gobble_loop_next) 242 inc %edx 243 add $32, %ecx 244 L(shr_0_gobble_loop_next): 245 test %edx, %edx 246 jnz L(exit) 247 248 pmovmskb %xmm2, %edx 249 movdqa %xmm0, %xmm1 250 lea 32(%edi), %edi 251 lea 32(%esi), %esi 252 sub $0xffff, %edx 253 jnz L(exit) 254 lea (%ecx, %edi,1), %eax 255 lea (%ecx, %esi,1), %edx 256 POP (%edi) 257 POP (%esi) 258 jmp L(less48bytes) 259 260 CFI_PUSH (%esi) 261 CFI_PUSH (%edi) 262 ALIGN (4) 263 L(shr_1): 264 cmp $80, %ecx 265 lea -48(%ecx), %ecx 266 mov %edx, %eax 267 jae L(shr_1_gobble) 268 269 movdqa 16(%esi), %xmm1 270 movdqa %xmm1, %xmm2 271 palignr $1,(%esi), %xmm1 272 pcmpeqb (%edi), %xmm1 273 274 movdqa 32(%esi), %xmm3 275 palignr $1,%xmm2, %xmm3 276 pcmpeqb 16(%edi), %xmm3 277 278 pand %xmm1, %xmm3 279 pmovmskb %xmm3, %edx 280 lea 32(%edi), %edi 281 lea 32(%esi), %esi 282 sub $0xffff, %edx 283 jnz L(exit) 284 lea (%ecx, %edi,1), %eax 285 lea 1(%ecx, %esi,1), %edx 286 POP (%edi) 287 POP (%esi) 288 jmp L(less48bytes) 289 290 CFI_PUSH (%esi) 291 CFI_PUSH (%edi) 292 ALIGN (4) 293 L(shr_1_gobble): 294 sub $32, %ecx 295 movdqa 16(%esi), %xmm0 296 palignr $1,(%esi), %xmm0 297 pcmpeqb (%edi), %xmm0 298 299 movdqa 32(%esi), %xmm3 300 palignr $1,16(%esi), %xmm3 301 pcmpeqb 16(%edi), %xmm3 302 303 L(shr_1_gobble_loop): 304 pand %xmm0, %xmm3 305 sub $32, %ecx 306 pmovmskb %xmm3, %edx 307 movdqa %xmm0, %xmm1 308 309 movdqa 64(%esi), %xmm3 310 palignr $1,48(%esi), %xmm3 311 sbb $0xffff, %edx 312 movdqa 48(%esi), %xmm0 313 palignr $1,32(%esi), %xmm0 314 pcmpeqb 32(%edi), %xmm0 315 lea 32(%esi), %esi 316 pcmpeqb 48(%edi), %xmm3 317 318 lea 32(%edi), %edi 319 jz L(shr_1_gobble_loop) 320 pand %xmm0, %xmm3 321 322 cmp $0, %ecx 323 jge L(shr_1_gobble_next) 324 inc %edx 325 add $32, %ecx 326 L(shr_1_gobble_next): 327 test %edx, %edx 328 jnz L(exit) 329 330 pmovmskb %xmm3, %edx 331 movdqa %xmm0, %xmm1 332 lea 32(%edi), %edi 333 
lea 32(%esi), %esi 334 sub $0xffff, %edx 335 jnz L(exit) 336 337 lea (%ecx, %edi,1), %eax 338 lea 1(%ecx, %esi,1), %edx 339 POP (%edi) 340 POP (%esi) 341 jmp L(less48bytes) 342 343 CFI_PUSH (%esi) 344 CFI_PUSH (%edi) 345 ALIGN (4) 346 L(shr_2): 347 cmp $80, %ecx 348 lea -48(%ecx), %ecx 349 mov %edx, %eax 350 jae L(shr_2_gobble) 351 352 movdqa 16(%esi), %xmm1 353 movdqa %xmm1, %xmm2 354 palignr $2,(%esi), %xmm1 355 pcmpeqb (%edi), %xmm1 356 357 movdqa 32(%esi), %xmm3 358 palignr $2,%xmm2, %xmm3 359 pcmpeqb 16(%edi), %xmm3 360 361 pand %xmm1, %xmm3 362 pmovmskb %xmm3, %edx 363 lea 32(%edi), %edi 364 lea 32(%esi), %esi 365 sub $0xffff, %edx 366 jnz L(exit) 367 lea (%ecx, %edi,1), %eax 368 lea 2(%ecx, %esi,1), %edx 369 POP (%edi) 370 POP (%esi) 371 jmp L(less48bytes) 372 373 CFI_PUSH (%esi) 374 CFI_PUSH (%edi) 375 ALIGN (4) 376 L(shr_2_gobble): 377 sub $32, %ecx 378 movdqa 16(%esi), %xmm0 379 palignr $2,(%esi), %xmm0 380 pcmpeqb (%edi), %xmm0 381 382 movdqa 32(%esi), %xmm3 383 palignr $2,16(%esi), %xmm3 384 pcmpeqb 16(%edi), %xmm3 385 386 L(shr_2_gobble_loop): 387 pand %xmm0, %xmm3 388 sub $32, %ecx 389 pmovmskb %xmm3, %edx 390 movdqa %xmm0, %xmm1 391 392 movdqa 64(%esi), %xmm3 393 palignr $2,48(%esi), %xmm3 394 sbb $0xffff, %edx 395 movdqa 48(%esi), %xmm0 396 palignr $2,32(%esi), %xmm0 397 pcmpeqb 32(%edi), %xmm0 398 lea 32(%esi), %esi 399 pcmpeqb 48(%edi), %xmm3 400 401 lea 32(%edi), %edi 402 jz L(shr_2_gobble_loop) 403 pand %xmm0, %xmm3 404 405 cmp $0, %ecx 406 jge L(shr_2_gobble_next) 407 inc %edx 408 add $32, %ecx 409 L(shr_2_gobble_next): 410 test %edx, %edx 411 jnz L(exit) 412 413 pmovmskb %xmm3, %edx 414 movdqa %xmm0, %xmm1 415 lea 32(%edi), %edi 416 lea 32(%esi), %esi 417 sub $0xffff, %edx 418 jnz L(exit) 419 420 lea (%ecx, %edi,1), %eax 421 lea 2(%ecx, %esi,1), %edx 422 POP (%edi) 423 POP (%esi) 424 jmp L(less48bytes) 425 426 CFI_PUSH (%esi) 427 CFI_PUSH (%edi) 428 ALIGN (4) 429 L(shr_3): 430 cmp $80, %ecx 431 lea -48(%ecx), %ecx 432 mov %edx, %eax 433 jae 
L(shr_3_gobble) 434 435 movdqa 16(%esi), %xmm1 436 movdqa %xmm1, %xmm2 437 palignr $3,(%esi), %xmm1 438 pcmpeqb (%edi), %xmm1 439 440 movdqa 32(%esi), %xmm3 441 palignr $3,%xmm2, %xmm3 442 pcmpeqb 16(%edi), %xmm3 443 444 pand %xmm1, %xmm3 445 pmovmskb %xmm3, %edx 446 lea 32(%edi), %edi 447 lea 32(%esi), %esi 448 sub $0xffff, %edx 449 jnz L(exit) 450 lea (%ecx, %edi,1), %eax 451 lea 3(%ecx, %esi,1), %edx 452 POP (%edi) 453 POP (%esi) 454 jmp L(less48bytes) 455 456 CFI_PUSH (%esi) 457 CFI_PUSH (%edi) 458 ALIGN (4) 459 L(shr_3_gobble): 460 sub $32, %ecx 461 movdqa 16(%esi), %xmm0 462 palignr $3,(%esi), %xmm0 463 pcmpeqb (%edi), %xmm0 464 465 movdqa 32(%esi), %xmm3 466 palignr $3,16(%esi), %xmm3 467 pcmpeqb 16(%edi), %xmm3 468 469 L(shr_3_gobble_loop): 470 pand %xmm0, %xmm3 471 sub $32, %ecx 472 pmovmskb %xmm3, %edx 473 movdqa %xmm0, %xmm1 474 475 movdqa 64(%esi), %xmm3 476 palignr $3,48(%esi), %xmm3 477 sbb $0xffff, %edx 478 movdqa 48(%esi), %xmm0 479 palignr $3,32(%esi), %xmm0 480 pcmpeqb 32(%edi), %xmm0 481 lea 32(%esi), %esi 482 pcmpeqb 48(%edi), %xmm3 483 484 lea 32(%edi), %edi 485 jz L(shr_3_gobble_loop) 486 pand %xmm0, %xmm3 487 488 cmp $0, %ecx 489 jge L(shr_3_gobble_next) 490 inc %edx 491 add $32, %ecx 492 L(shr_3_gobble_next): 493 test %edx, %edx 494 jnz L(exit) 495 496 pmovmskb %xmm3, %edx 497 movdqa %xmm0, %xmm1 498 lea 32(%edi), %edi 499 lea 32(%esi), %esi 500 sub $0xffff, %edx 501 jnz L(exit) 502 503 lea (%ecx, %edi,1), %eax 504 lea 3(%ecx, %esi,1), %edx 505 POP (%edi) 506 POP (%esi) 507 jmp L(less48bytes) 508 509 CFI_PUSH (%esi) 510 CFI_PUSH (%edi) 511 ALIGN (4) 512 L(shr_4): 513 cmp $80, %ecx 514 lea -48(%ecx), %ecx 515 mov %edx, %eax 516 jae L(shr_4_gobble) 517 518 movdqa 16(%esi), %xmm1 519 movdqa %xmm1, %xmm2 520 palignr $4,(%esi), %xmm1 521 pcmpeqb (%edi), %xmm1 522 523 movdqa 32(%esi), %xmm3 524 palignr $4,%xmm2, %xmm3 525 pcmpeqb 16(%edi), %xmm3 526 527 pand %xmm1, %xmm3 528 pmovmskb %xmm3, %edx 529 lea 32(%edi), %edi 530 lea 32(%esi), %esi 531 
sub $0xffff, %edx 532 jnz L(exit) 533 lea (%ecx, %edi,1), %eax 534 lea 4(%ecx, %esi,1), %edx 535 POP (%edi) 536 POP (%esi) 537 jmp L(less48bytes) 538 539 CFI_PUSH (%esi) 540 CFI_PUSH (%edi) 541 ALIGN (4) 542 L(shr_4_gobble): 543 sub $32, %ecx 544 movdqa 16(%esi), %xmm0 545 palignr $4,(%esi), %xmm0 546 pcmpeqb (%edi), %xmm0 547 548 movdqa 32(%esi), %xmm3 549 palignr $4,16(%esi), %xmm3 550 pcmpeqb 16(%edi), %xmm3 551 552 L(shr_4_gobble_loop): 553 pand %xmm0, %xmm3 554 sub $32, %ecx 555 pmovmskb %xmm3, %edx 556 movdqa %xmm0, %xmm1 557 558 movdqa 64(%esi), %xmm3 559 palignr $4,48(%esi), %xmm3 560 sbb $0xffff, %edx 561 movdqa 48(%esi), %xmm0 562 palignr $4,32(%esi), %xmm0 563 pcmpeqb 32(%edi), %xmm0 564 lea 32(%esi), %esi 565 pcmpeqb 48(%edi), %xmm3 566 567 lea 32(%edi), %edi 568 jz L(shr_4_gobble_loop) 569 pand %xmm0, %xmm3 570 571 cmp $0, %ecx 572 jge L(shr_4_gobble_next) 573 inc %edx 574 add $32, %ecx 575 L(shr_4_gobble_next): 576 test %edx, %edx 577 jnz L(exit) 578 579 pmovmskb %xmm3, %edx 580 movdqa %xmm0, %xmm1 581 lea 32(%edi), %edi 582 lea 32(%esi), %esi 583 sub $0xffff, %edx 584 jnz L(exit) 585 586 lea (%ecx, %edi,1), %eax 587 lea 4(%ecx, %esi,1), %edx 588 POP (%edi) 589 POP (%esi) 590 jmp L(less48bytes) 591 592 CFI_PUSH (%esi) 593 CFI_PUSH (%edi) 594 ALIGN (4) 595 L(shr_5): 596 cmp $80, %ecx 597 lea -48(%ecx), %ecx 598 mov %edx, %eax 599 jae L(shr_5_gobble) 600 601 movdqa 16(%esi), %xmm1 602 movdqa %xmm1, %xmm2 603 palignr $5,(%esi), %xmm1 604 pcmpeqb (%edi), %xmm1 605 606 movdqa 32(%esi), %xmm3 607 palignr $5,%xmm2, %xmm3 608 pcmpeqb 16(%edi), %xmm3 609 610 pand %xmm1, %xmm3 611 pmovmskb %xmm3, %edx 612 lea 32(%edi), %edi 613 lea 32(%esi), %esi 614 sub $0xffff, %edx 615 jnz L(exit) 616 lea (%ecx, %edi,1), %eax 617 lea 5(%ecx, %esi,1), %edx 618 POP (%edi) 619 POP (%esi) 620 jmp L(less48bytes) 621 622 CFI_PUSH (%esi) 623 CFI_PUSH (%edi) 624 ALIGN (4) 625 L(shr_5_gobble): 626 sub $32, %ecx 627 movdqa 16(%esi), %xmm0 628 palignr $5,(%esi), %xmm0 629 pcmpeqb 
(%edi), %xmm0 630 631 movdqa 32(%esi), %xmm3 632 palignr $5,16(%esi), %xmm3 633 pcmpeqb 16(%edi), %xmm3 634 635 L(shr_5_gobble_loop): 636 pand %xmm0, %xmm3 637 sub $32, %ecx 638 pmovmskb %xmm3, %edx 639 movdqa %xmm0, %xmm1 640 641 movdqa 64(%esi), %xmm3 642 palignr $5,48(%esi), %xmm3 643 sbb $0xffff, %edx 644 movdqa 48(%esi), %xmm0 645 palignr $5,32(%esi), %xmm0 646 pcmpeqb 32(%edi), %xmm0 647 lea 32(%esi), %esi 648 pcmpeqb 48(%edi), %xmm3 649 650 lea 32(%edi), %edi 651 jz L(shr_5_gobble_loop) 652 pand %xmm0, %xmm3 653 654 cmp $0, %ecx 655 jge L(shr_5_gobble_next) 656 inc %edx 657 add $32, %ecx 658 L(shr_5_gobble_next): 659 test %edx, %edx 660 jnz L(exit) 661 662 pmovmskb %xmm3, %edx 663 movdqa %xmm0, %xmm1 664 lea 32(%edi), %edi 665 lea 32(%esi), %esi 666 sub $0xffff, %edx 667 jnz L(exit) 668 669 lea (%ecx, %edi,1), %eax 670 lea 5(%ecx, %esi,1), %edx 671 POP (%edi) 672 POP (%esi) 673 jmp L(less48bytes) 674 675 CFI_PUSH (%esi) 676 CFI_PUSH (%edi) 677 ALIGN (4) 678 L(shr_6): 679 cmp $80, %ecx 680 lea -48(%ecx), %ecx 681 mov %edx, %eax 682 jae L(shr_6_gobble) 683 684 movdqa 16(%esi), %xmm1 685 movdqa %xmm1, %xmm2 686 palignr $6,(%esi), %xmm1 687 pcmpeqb (%edi), %xmm1 688 689 movdqa 32(%esi), %xmm3 690 palignr $6,%xmm2, %xmm3 691 pcmpeqb 16(%edi), %xmm3 692 693 pand %xmm1, %xmm3 694 pmovmskb %xmm3, %edx 695 lea 32(%edi), %edi 696 lea 32(%esi), %esi 697 sub $0xffff, %edx 698 jnz L(exit) 699 lea (%ecx, %edi,1), %eax 700 lea 6(%ecx, %esi,1), %edx 701 POP (%edi) 702 POP (%esi) 703 jmp L(less48bytes) 704 705 CFI_PUSH (%esi) 706 CFI_PUSH (%edi) 707 ALIGN (4) 708 L(shr_6_gobble): 709 sub $32, %ecx 710 movdqa 16(%esi), %xmm0 711 palignr $6,(%esi), %xmm0 712 pcmpeqb (%edi), %xmm0 713 714 movdqa 32(%esi), %xmm3 715 palignr $6,16(%esi), %xmm3 716 pcmpeqb 16(%edi), %xmm3 717 718 L(shr_6_gobble_loop): 719 pand %xmm0, %xmm3 720 sub $32, %ecx 721 pmovmskb %xmm3, %edx 722 movdqa %xmm0, %xmm1 723 724 movdqa 64(%esi), %xmm3 725 palignr $6,48(%esi), %xmm3 726 sbb $0xffff, %edx 727 
movdqa 48(%esi), %xmm0 728 palignr $6,32(%esi), %xmm0 729 pcmpeqb 32(%edi), %xmm0 730 lea 32(%esi), %esi 731 pcmpeqb 48(%edi), %xmm3 732 733 lea 32(%edi), %edi 734 jz L(shr_6_gobble_loop) 735 pand %xmm0, %xmm3 736 737 cmp $0, %ecx 738 jge L(shr_6_gobble_next) 739 inc %edx 740 add $32, %ecx 741 L(shr_6_gobble_next): 742 test %edx, %edx 743 jnz L(exit) 744 745 pmovmskb %xmm3, %edx 746 movdqa %xmm0, %xmm1 747 lea 32(%edi), %edi 748 lea 32(%esi), %esi 749 sub $0xffff, %edx 750 jnz L(exit) 751 752 lea (%ecx, %edi,1), %eax 753 lea 6(%ecx, %esi,1), %edx 754 POP (%edi) 755 POP (%esi) 756 jmp L(less48bytes) 757 758 CFI_PUSH (%esi) 759 CFI_PUSH (%edi) 760 ALIGN (4) 761 L(shr_7): 762 cmp $80, %ecx 763 lea -48(%ecx), %ecx 764 mov %edx, %eax 765 jae L(shr_7_gobble) 766 767 movdqa 16(%esi), %xmm1 768 movdqa %xmm1, %xmm2 769 palignr $7,(%esi), %xmm1 770 pcmpeqb (%edi), %xmm1 771 772 movdqa 32(%esi), %xmm3 773 palignr $7,%xmm2, %xmm3 774 pcmpeqb 16(%edi), %xmm3 775 776 pand %xmm1, %xmm3 777 pmovmskb %xmm3, %edx 778 lea 32(%edi), %edi 779 lea 32(%esi), %esi 780 sub $0xffff, %edx 781 jnz L(exit) 782 lea (%ecx, %edi,1), %eax 783 lea 7(%ecx, %esi,1), %edx 784 POP (%edi) 785 POP (%esi) 786 jmp L(less48bytes) 787 788 CFI_PUSH (%esi) 789 CFI_PUSH (%edi) 790 ALIGN (4) 791 L(shr_7_gobble): 792 sub $32, %ecx 793 movdqa 16(%esi), %xmm0 794 palignr $7,(%esi), %xmm0 795 pcmpeqb (%edi), %xmm0 796 797 movdqa 32(%esi), %xmm3 798 palignr $7,16(%esi), %xmm3 799 pcmpeqb 16(%edi), %xmm3 800 801 L(shr_7_gobble_loop): 802 pand %xmm0, %xmm3 803 sub $32, %ecx 804 pmovmskb %xmm3, %edx 805 movdqa %xmm0, %xmm1 806 807 movdqa 64(%esi), %xmm3 808 palignr $7,48(%esi), %xmm3 809 sbb $0xffff, %edx 810 movdqa 48(%esi), %xmm0 811 palignr $7,32(%esi), %xmm0 812 pcmpeqb 32(%edi), %xmm0 813 lea 32(%esi), %esi 814 pcmpeqb 48(%edi), %xmm3 815 816 lea 32(%edi), %edi 817 jz L(shr_7_gobble_loop) 818 pand %xmm0, %xmm3 819 820 cmp $0, %ecx 821 jge L(shr_7_gobble_next) 822 inc %edx 823 add $32, %ecx 824 L(shr_7_gobble_next): 
825 test %edx, %edx 826 jnz L(exit) 827 828 pmovmskb %xmm3, %edx 829 movdqa %xmm0, %xmm1 830 lea 32(%edi), %edi 831 lea 32(%esi), %esi 832 sub $0xffff, %edx 833 jnz L(exit) 834 835 lea (%ecx, %edi,1), %eax 836 lea 7(%ecx, %esi,1), %edx 837 POP (%edi) 838 POP (%esi) 839 jmp L(less48bytes) 840 841 CFI_PUSH (%esi) 842 CFI_PUSH (%edi) 843 ALIGN (4) 844 L(shr_8): 845 cmp $80, %ecx 846 lea -48(%ecx), %ecx 847 mov %edx, %eax 848 jae L(shr_8_gobble) 849 850 movdqa 16(%esi), %xmm1 851 movdqa %xmm1, %xmm2 852 palignr $8,(%esi), %xmm1 853 pcmpeqb (%edi), %xmm1 854 855 movdqa 32(%esi), %xmm3 856 palignr $8,%xmm2, %xmm3 857 pcmpeqb 16(%edi), %xmm3 858 859 pand %xmm1, %xmm3 860 pmovmskb %xmm3, %edx 861 lea 32(%edi), %edi 862 lea 32(%esi), %esi 863 sub $0xffff, %edx 864 jnz L(exit) 865 lea (%ecx, %edi,1), %eax 866 lea 8(%ecx, %esi,1), %edx 867 POP (%edi) 868 POP (%esi) 869 jmp L(less48bytes) 870 871 CFI_PUSH (%esi) 872 CFI_PUSH (%edi) 873 ALIGN (4) 874 L(shr_8_gobble): 875 sub $32, %ecx 876 movdqa 16(%esi), %xmm0 877 palignr $8,(%esi), %xmm0 878 pcmpeqb (%edi), %xmm0 879 880 movdqa 32(%esi), %xmm3 881 palignr $8,16(%esi), %xmm3 882 pcmpeqb 16(%edi), %xmm3 883 884 L(shr_8_gobble_loop): 885 pand %xmm0, %xmm3 886 sub $32, %ecx 887 pmovmskb %xmm3, %edx 888 movdqa %xmm0, %xmm1 889 890 movdqa 64(%esi), %xmm3 891 palignr $8,48(%esi), %xmm3 892 sbb $0xffff, %edx 893 movdqa 48(%esi), %xmm0 894 palignr $8,32(%esi), %xmm0 895 pcmpeqb 32(%edi), %xmm0 896 lea 32(%esi), %esi 897 pcmpeqb 48(%edi), %xmm3 898 899 lea 32(%edi), %edi 900 jz L(shr_8_gobble_loop) 901 pand %xmm0, %xmm3 902 903 cmp $0, %ecx 904 jge L(shr_8_gobble_next) 905 inc %edx 906 add $32, %ecx 907 L(shr_8_gobble_next): 908 test %edx, %edx 909 jnz L(exit) 910 911 pmovmskb %xmm3, %edx 912 movdqa %xmm0, %xmm1 913 lea 32(%edi), %edi 914 lea 32(%esi), %esi 915 sub $0xffff, %edx 916 jnz L(exit) 917 918 lea (%ecx, %edi,1), %eax 919 lea 8(%ecx, %esi,1), %edx 920 POP (%edi) 921 POP (%esi) 922 jmp L(less48bytes) 923 924 CFI_PUSH (%esi) 925 
CFI_PUSH (%edi) 926 ALIGN (4) 927 L(shr_9): 928 cmp $80, %ecx 929 lea -48(%ecx), %ecx 930 mov %edx, %eax 931 jae L(shr_9_gobble) 932 933 movdqa 16(%esi), %xmm1 934 movdqa %xmm1, %xmm2 935 palignr $9,(%esi), %xmm1 936 pcmpeqb (%edi), %xmm1 937 938 movdqa 32(%esi), %xmm3 939 palignr $9,%xmm2, %xmm3 940 pcmpeqb 16(%edi), %xmm3 941 942 pand %xmm1, %xmm3 943 pmovmskb %xmm3, %edx 944 lea 32(%edi), %edi 945 lea 32(%esi), %esi 946 sub $0xffff, %edx 947 jnz L(exit) 948 lea (%ecx, %edi,1), %eax 949 lea 9(%ecx, %esi,1), %edx 950 POP (%edi) 951 POP (%esi) 952 jmp L(less48bytes) 953 954 CFI_PUSH (%esi) 955 CFI_PUSH (%edi) 956 ALIGN (4) 957 L(shr_9_gobble): 958 sub $32, %ecx 959 movdqa 16(%esi), %xmm0 960 palignr $9,(%esi), %xmm0 961 pcmpeqb (%edi), %xmm0 962 963 movdqa 32(%esi), %xmm3 964 palignr $9,16(%esi), %xmm3 965 pcmpeqb 16(%edi), %xmm3 966 967 L(shr_9_gobble_loop): 968 pand %xmm0, %xmm3 969 sub $32, %ecx 970 pmovmskb %xmm3, %edx 971 movdqa %xmm0, %xmm1 972 973 movdqa 64(%esi), %xmm3 974 palignr $9,48(%esi), %xmm3 975 sbb $0xffff, %edx 976 movdqa 48(%esi), %xmm0 977 palignr $9,32(%esi), %xmm0 978 pcmpeqb 32(%edi), %xmm0 979 lea 32(%esi), %esi 980 pcmpeqb 48(%edi), %xmm3 981 982 lea 32(%edi), %edi 983 jz L(shr_9_gobble_loop) 984 pand %xmm0, %xmm3 985 986 cmp $0, %ecx 987 jge L(shr_9_gobble_next) 988 inc %edx 989 add $32, %ecx 990 L(shr_9_gobble_next): 991 test %edx, %edx 992 jnz L(exit) 993 994 pmovmskb %xmm3, %edx 995 movdqa %xmm0, %xmm1 996 lea 32(%edi), %edi 997 lea 32(%esi), %esi 998 sub $0xffff, %edx 999 jnz L(exit) 1000 1001 lea (%ecx, %edi,1), %eax 1002 lea 9(%ecx, %esi,1), %edx 1003 POP (%edi) 1004 POP (%esi) 1005 jmp L(less48bytes) 1006 1007 CFI_PUSH (%esi) 1008 CFI_PUSH (%edi) 1009 ALIGN (4) 1010 L(shr_10): 1011 cmp $80, %ecx 1012 lea -48(%ecx), %ecx 1013 mov %edx, %eax 1014 jae L(shr_10_gobble) 1015 1016 movdqa 16(%esi), %xmm1 1017 movdqa %xmm1, %xmm2 1018 palignr $10, (%esi), %xmm1 1019 pcmpeqb (%edi), %xmm1 1020 1021 movdqa 32(%esi), %xmm3 1022 palignr 
$10,%xmm2, %xmm3 1023 pcmpeqb 16(%edi), %xmm3 1024 1025 pand %xmm1, %xmm3 1026 pmovmskb %xmm3, %edx 1027 lea 32(%edi), %edi 1028 lea 32(%esi), %esi 1029 sub $0xffff, %edx 1030 jnz L(exit) 1031 lea (%ecx, %edi,1), %eax 1032 lea 10(%ecx, %esi,1), %edx 1033 POP (%edi) 1034 POP (%esi) 1035 jmp L(less48bytes) 1036 1037 CFI_PUSH (%esi) 1038 CFI_PUSH (%edi) 1039 ALIGN (4) 1040 L(shr_10_gobble): 1041 sub $32, %ecx 1042 movdqa 16(%esi), %xmm0 1043 palignr $10, (%esi), %xmm0 1044 pcmpeqb (%edi), %xmm0 1045 1046 movdqa 32(%esi), %xmm3 1047 palignr $10, 16(%esi), %xmm3 1048 pcmpeqb 16(%edi), %xmm3 1049 1050 L(shr_10_gobble_loop): 1051 pand %xmm0, %xmm3 1052 sub $32, %ecx 1053 pmovmskb %xmm3, %edx 1054 movdqa %xmm0, %xmm1 1055 1056 movdqa 64(%esi), %xmm3 1057 palignr $10,48(%esi), %xmm3 1058 sbb $0xffff, %edx 1059 movdqa 48(%esi), %xmm0 1060 palignr $10,32(%esi), %xmm0 1061 pcmpeqb 32(%edi), %xmm0 1062 lea 32(%esi), %esi 1063 pcmpeqb 48(%edi), %xmm3 1064 1065 lea 32(%edi), %edi 1066 jz L(shr_10_gobble_loop) 1067 pand %xmm0, %xmm3 1068 1069 cmp $0, %ecx 1070 jge L(shr_10_gobble_next) 1071 inc %edx 1072 add $32, %ecx 1073 L(shr_10_gobble_next): 1074 test %edx, %edx 1075 jnz L(exit) 1076 1077 pmovmskb %xmm3, %edx 1078 movdqa %xmm0, %xmm1 1079 lea 32(%edi), %edi 1080 lea 32(%esi), %esi 1081 sub $0xffff, %edx 1082 jnz L(exit) 1083 1084 lea (%ecx, %edi,1), %eax 1085 lea 10(%ecx, %esi,1), %edx 1086 POP (%edi) 1087 POP (%esi) 1088 jmp L(less48bytes) 1089 1090 CFI_PUSH (%esi) 1091 CFI_PUSH (%edi) 1092 ALIGN (4) 1093 L(shr_11): 1094 cmp $80, %ecx 1095 lea -48(%ecx), %ecx 1096 mov %edx, %eax 1097 jae L(shr_11_gobble) 1098 1099 movdqa 16(%esi), %xmm1 1100 movdqa %xmm1, %xmm2 1101 palignr $11, (%esi), %xmm1 1102 pcmpeqb (%edi), %xmm1 1103 1104 movdqa 32(%esi), %xmm3 1105 palignr $11, %xmm2, %xmm3 1106 pcmpeqb 16(%edi), %xmm3 1107 1108 pand %xmm1, %xmm3 1109 pmovmskb %xmm3, %edx 1110 lea 32(%edi), %edi 1111 lea 32(%esi), %esi 1112 sub $0xffff, %edx 1113 jnz L(exit) 1114 lea (%ecx, %edi,1), 
%eax 1115 lea 11(%ecx, %esi,1), %edx 1116 POP (%edi) 1117 POP (%esi) 1118 jmp L(less48bytes) 1119 1120 CFI_PUSH (%esi) 1121 CFI_PUSH (%edi) 1122 ALIGN (4) 1123 L(shr_11_gobble): 1124 sub $32, %ecx 1125 movdqa 16(%esi), %xmm0 1126 palignr $11, (%esi), %xmm0 1127 pcmpeqb (%edi), %xmm0 1128 1129 movdqa 32(%esi), %xmm3 1130 palignr $11, 16(%esi), %xmm3 1131 pcmpeqb 16(%edi), %xmm3 1132 1133 L(shr_11_gobble_loop): 1134 pand %xmm0, %xmm3 1135 sub $32, %ecx 1136 pmovmskb %xmm3, %edx 1137 movdqa %xmm0, %xmm1 1138 1139 movdqa 64(%esi), %xmm3 1140 palignr $11,48(%esi), %xmm3 1141 sbb $0xffff, %edx 1142 movdqa 48(%esi), %xmm0 1143 palignr $11,32(%esi), %xmm0 1144 pcmpeqb 32(%edi), %xmm0 1145 lea 32(%esi), %esi 1146 pcmpeqb 48(%edi), %xmm3 1147 1148 lea 32(%edi), %edi 1149 jz L(shr_11_gobble_loop) 1150 pand %xmm0, %xmm3 1151 1152 cmp $0, %ecx 1153 jge L(shr_11_gobble_next) 1154 inc %edx 1155 add $32, %ecx 1156 L(shr_11_gobble_next): 1157 test %edx, %edx 1158 jnz L(exit) 1159 1160 pmovmskb %xmm3, %edx 1161 movdqa %xmm0, %xmm1 1162 lea 32(%edi), %edi 1163 lea 32(%esi), %esi 1164 sub $0xffff, %edx 1165 jnz L(exit) 1166 1167 lea (%ecx, %edi,1), %eax 1168 lea 11(%ecx, %esi,1), %edx 1169 POP (%edi) 1170 POP (%esi) 1171 jmp L(less48bytes) 1172 1173 CFI_PUSH (%esi) 1174 CFI_PUSH (%edi) 1175 ALIGN (4) 1176 L(shr_12): 1177 cmp $80, %ecx 1178 lea -48(%ecx), %ecx 1179 mov %edx, %eax 1180 jae L(shr_12_gobble) 1181 1182 movdqa 16(%esi), %xmm1 1183 movdqa %xmm1, %xmm2 1184 palignr $12, (%esi), %xmm1 1185 pcmpeqb (%edi), %xmm1 1186 1187 movdqa 32(%esi), %xmm3 1188 palignr $12, %xmm2, %xmm3 1189 pcmpeqb 16(%edi), %xmm3 1190 1191 pand %xmm1, %xmm3 1192 pmovmskb %xmm3, %edx 1193 lea 32(%edi), %edi 1194 lea 32(%esi), %esi 1195 sub $0xffff, %edx 1196 jnz L(exit) 1197 lea (%ecx, %edi,1), %eax 1198 lea 12(%ecx, %esi,1), %edx 1199 POP (%edi) 1200 POP (%esi) 1201 jmp L(less48bytes) 1202 1203 CFI_PUSH (%esi) 1204 CFI_PUSH (%edi) 1205 ALIGN (4) 1206 L(shr_12_gobble): 1207 sub $32, %ecx 1208 movdqa 
16(%esi), %xmm0 1209 palignr $12, (%esi), %xmm0 1210 pcmpeqb (%edi), %xmm0 1211 1212 movdqa 32(%esi), %xmm3 1213 palignr $12, 16(%esi), %xmm3 1214 pcmpeqb 16(%edi), %xmm3 1215 1216 L(shr_12_gobble_loop): 1217 pand %xmm0, %xmm3 1218 sub $32, %ecx 1219 pmovmskb %xmm3, %edx 1220 movdqa %xmm0, %xmm1 1221 1222 movdqa 64(%esi), %xmm3 1223 palignr $12,48(%esi), %xmm3 1224 sbb $0xffff, %edx 1225 movdqa 48(%esi), %xmm0 1226 palignr $12,32(%esi), %xmm0 1227 pcmpeqb 32(%edi), %xmm0 1228 lea 32(%esi), %esi 1229 pcmpeqb 48(%edi), %xmm3 1230 1231 lea 32(%edi), %edi 1232 jz L(shr_12_gobble_loop) 1233 pand %xmm0, %xmm3 1234 1235 cmp $0, %ecx 1236 jge L(shr_12_gobble_next) 1237 inc %edx 1238 add $32, %ecx 1239 L(shr_12_gobble_next): 1240 test %edx, %edx 1241 jnz L(exit) 1242 1243 pmovmskb %xmm3, %edx 1244 movdqa %xmm0, %xmm1 1245 lea 32(%edi), %edi 1246 lea 32(%esi), %esi 1247 sub $0xffff, %edx 1248 jnz L(exit) 1249 1250 lea (%ecx, %edi,1), %eax 1251 lea 12(%ecx, %esi,1), %edx 1252 POP (%edi) 1253 POP (%esi) 1254 jmp L(less48bytes) 1255 1256 CFI_PUSH (%esi) 1257 CFI_PUSH (%edi) 1258 ALIGN (4) 1259 L(shr_13): 1260 cmp $80, %ecx 1261 lea -48(%ecx), %ecx 1262 mov %edx, %eax 1263 jae L(shr_13_gobble) 1264 1265 movdqa 16(%esi), %xmm1 1266 movdqa %xmm1, %xmm2 1267 palignr $13, (%esi), %xmm1 1268 pcmpeqb (%edi), %xmm1 1269 1270 movdqa 32(%esi), %xmm3 1271 palignr $13, %xmm2, %xmm3 1272 pcmpeqb 16(%edi), %xmm3 1273 1274 pand %xmm1, %xmm3 1275 pmovmskb %xmm3, %edx 1276 lea 32(%edi), %edi 1277 lea 32(%esi), %esi 1278 sub $0xffff, %edx 1279 jnz L(exit) 1280 lea (%ecx, %edi,1), %eax 1281 lea 13(%ecx, %esi,1), %edx 1282 POP (%edi) 1283 POP (%esi) 1284 jmp L(less48bytes) 1285 1286 CFI_PUSH (%esi) 1287 CFI_PUSH (%edi) 1288 ALIGN (4) 1289 L(shr_13_gobble): 1290 sub $32, %ecx 1291 movdqa 16(%esi), %xmm0 1292 palignr $13, (%esi), %xmm0 1293 pcmpeqb (%edi), %xmm0 1294 1295 movdqa 32(%esi), %xmm3 1296 palignr $13, 16(%esi), %xmm3 1297 pcmpeqb 16(%edi), %xmm3 1298 1299 L(shr_13_gobble_loop): 1300 pand 
%xmm0, %xmm3 1301 sub $32, %ecx 1302 pmovmskb %xmm3, %edx 1303 movdqa %xmm0, %xmm1 1304 1305 movdqa 64(%esi), %xmm3 1306 palignr $13,48(%esi), %xmm3 1307 sbb $0xffff, %edx 1308 movdqa 48(%esi), %xmm0 1309 palignr $13,32(%esi), %xmm0 1310 pcmpeqb 32(%edi), %xmm0 1311 lea 32(%esi), %esi 1312 pcmpeqb 48(%edi), %xmm3 1313 1314 lea 32(%edi), %edi 1315 jz L(shr_13_gobble_loop) 1316 pand %xmm0, %xmm3 1317 1318 cmp $0, %ecx 1319 jge L(shr_13_gobble_next) 1320 inc %edx 1321 add $32, %ecx 1322 L(shr_13_gobble_next): 1323 test %edx, %edx 1324 jnz L(exit) 1325 1326 pmovmskb %xmm3, %edx 1327 movdqa %xmm0, %xmm1 1328 lea 32(%edi), %edi 1329 lea 32(%esi), %esi 1330 sub $0xffff, %edx 1331 jnz L(exit) 1332 1333 lea (%ecx, %edi,1), %eax 1334 lea 13(%ecx, %esi,1), %edx 1335 POP (%edi) 1336 POP (%esi) 1337 jmp L(less48bytes) 1338 1339 CFI_PUSH (%esi) 1340 CFI_PUSH (%edi) 1341 ALIGN (4) 1342 L(shr_14): 1343 cmp $80, %ecx 1344 lea -48(%ecx), %ecx 1345 mov %edx, %eax 1346 jae L(shr_14_gobble) 1347 1348 movdqa 16(%esi), %xmm1 1349 movdqa %xmm1, %xmm2 1350 palignr $14, (%esi), %xmm1 1351 pcmpeqb (%edi), %xmm1 1352 1353 movdqa 32(%esi), %xmm3 1354 palignr $14, %xmm2, %xmm3 1355 pcmpeqb 16(%edi), %xmm3 1356 1357 pand %xmm1, %xmm3 1358 pmovmskb %xmm3, %edx 1359 lea 32(%edi), %edi 1360 lea 32(%esi), %esi 1361 sub $0xffff, %edx 1362 jnz L(exit) 1363 lea (%ecx, %edi,1), %eax 1364 lea 14(%ecx, %esi,1), %edx 1365 POP (%edi) 1366 POP (%esi) 1367 jmp L(less48bytes) 1368 1369 CFI_PUSH (%esi) 1370 CFI_PUSH (%edi) 1371 ALIGN (4) 1372 L(shr_14_gobble): 1373 sub $32, %ecx 1374 movdqa 16(%esi), %xmm0 1375 palignr $14, (%esi), %xmm0 1376 pcmpeqb (%edi), %xmm0 1377 1378 movdqa 32(%esi), %xmm3 1379 palignr $14, 16(%esi), %xmm3 1380 pcmpeqb 16(%edi), %xmm3 1381 1382 L(shr_14_gobble_loop): 1383 pand %xmm0, %xmm3 1384 sub $32, %ecx 1385 pmovmskb %xmm3, %edx 1386 movdqa %xmm0, %xmm1 1387 1388 movdqa 64(%esi), %xmm3 1389 palignr $14,48(%esi), %xmm3 1390 sbb $0xffff, %edx 1391 movdqa 48(%esi), %xmm0 1392 palignr 
$14,32(%esi), %xmm0 1393 pcmpeqb 32(%edi), %xmm0 1394 lea 32(%esi), %esi 1395 pcmpeqb 48(%edi), %xmm3 1396 1397 lea 32(%edi), %edi 1398 jz L(shr_14_gobble_loop) 1399 pand %xmm0, %xmm3 1400 1401 cmp $0, %ecx 1402 jge L(shr_14_gobble_next) 1403 inc %edx 1404 add $32, %ecx 1405 L(shr_14_gobble_next): 1406 test %edx, %edx 1407 jnz L(exit) 1408 1409 pmovmskb %xmm3, %edx 1410 movdqa %xmm0, %xmm1 1411 lea 32(%edi), %edi 1412 lea 32(%esi), %esi 1413 sub $0xffff, %edx 1414 jnz L(exit) 1415 1416 lea (%ecx, %edi,1), %eax 1417 lea 14(%ecx, %esi,1), %edx 1418 POP (%edi) 1419 POP (%esi) 1420 jmp L(less48bytes) 1421 1422 CFI_PUSH (%esi) 1423 CFI_PUSH (%edi) 1424 ALIGN (4) 1425 L(shr_15): 1426 cmp $80, %ecx 1427 lea -48(%ecx), %ecx 1428 mov %edx, %eax 1429 jae L(shr_15_gobble) 1430 1431 movdqa 16(%esi), %xmm1 1432 movdqa %xmm1, %xmm2 1433 palignr $15, (%esi), %xmm1 1434 pcmpeqb (%edi), %xmm1 1435 1436 movdqa 32(%esi), %xmm3 1437 palignr $15, %xmm2, %xmm3 1438 pcmpeqb 16(%edi), %xmm3 1439 1440 pand %xmm1, %xmm3 1441 pmovmskb %xmm3, %edx 1442 lea 32(%edi), %edi 1443 lea 32(%esi), %esi 1444 sub $0xffff, %edx 1445 jnz L(exit) 1446 lea (%ecx, %edi,1), %eax 1447 lea 15(%ecx, %esi,1), %edx 1448 POP (%edi) 1449 POP (%esi) 1450 jmp L(less48bytes) 1451 1452 CFI_PUSH (%esi) 1453 CFI_PUSH (%edi) 1454 ALIGN (4) 1455 L(shr_15_gobble): 1456 sub $32, %ecx 1457 movdqa 16(%esi), %xmm0 1458 palignr $15, (%esi), %xmm0 1459 pcmpeqb (%edi), %xmm0 1460 1461 movdqa 32(%esi), %xmm3 1462 palignr $15, 16(%esi), %xmm3 1463 pcmpeqb 16(%edi), %xmm3 1464 1465 L(shr_15_gobble_loop): 1466 pand %xmm0, %xmm3 1467 sub $32, %ecx 1468 pmovmskb %xmm3, %edx 1469 movdqa %xmm0, %xmm1 1470 1471 movdqa 64(%esi), %xmm3 1472 palignr $15,48(%esi), %xmm3 1473 sbb $0xffff, %edx 1474 movdqa 48(%esi), %xmm0 1475 palignr $15,32(%esi), %xmm0 1476 pcmpeqb 32(%edi), %xmm0 1477 lea 32(%esi), %esi 1478 pcmpeqb 48(%edi), %xmm3 1479 1480 lea 32(%edi), %edi 1481 jz L(shr_15_gobble_loop) 1482 pand %xmm0, %xmm3 1483 1484 cmp $0, %ecx 1485 
jge L(shr_15_gobble_next) 1486 inc %edx 1487 add $32, %ecx 1488 L(shr_15_gobble_next): 1489 test %edx, %edx 1490 jnz L(exit) 1491 1492 pmovmskb %xmm3, %edx 1493 movdqa %xmm0, %xmm1 1494 lea 32(%edi), %edi 1495 lea 32(%esi), %esi 1496 sub $0xffff, %edx 1497 jnz L(exit) 1498 1499 lea (%ecx, %edi,1), %eax 1500 lea 15(%ecx, %esi,1), %edx 1501 POP (%edi) 1502 POP (%esi) 1503 jmp L(less48bytes) 1504 1505 CFI_PUSH (%esi) 1506 CFI_PUSH (%edi) 1507 ALIGN (4) 1508 L(exit): 1509 pmovmskb %xmm1, %ebx 1510 sub $0xffff, %ebx 1511 jz L(first16bytes) 1512 lea -16(%esi), %esi 1513 lea -16(%edi), %edi 1514 mov %ebx, %edx 1515 L(first16bytes): 1516 add %eax, %esi 1517 L(less16bytes): 1518 test %dl, %dl 1519 jz L(next_24_bytes) 1520 1521 test $0x01, %dl 1522 jnz L(Byte16) 1523 1524 test $0x02, %dl 1525 jnz L(Byte17) 1526 1527 test $0x04, %dl 1528 jnz L(Byte18) 1529 1530 test $0x08, %dl 1531 jnz L(Byte19) 1532 1533 test $0x10, %dl 1534 jnz L(Byte20) 1535 1536 test $0x20, %dl 1537 jnz L(Byte21) 1538 1539 test $0x40, %dl 1540 jnz L(Byte22) 1541 L(Byte23): 1542 movzbl -9(%edi), %eax 1543 movzbl -9(%esi), %edx 1544 sub %edx, %eax 1545 RETURN 1546 1547 ALIGN (4) 1548 L(Byte16): 1549 movzbl -16(%edi), %eax 1550 movzbl -16(%esi), %edx 1551 sub %edx, %eax 1552 RETURN 1553 1554 ALIGN (4) 1555 L(Byte17): 1556 movzbl -15(%edi), %eax 1557 movzbl -15(%esi), %edx 1558 sub %edx, %eax 1559 RETURN 1560 1561 ALIGN (4) 1562 L(Byte18): 1563 movzbl -14(%edi), %eax 1564 movzbl -14(%esi), %edx 1565 sub %edx, %eax 1566 RETURN 1567 1568 ALIGN (4) 1569 L(Byte19): 1570 movzbl -13(%edi), %eax 1571 movzbl -13(%esi), %edx 1572 sub %edx, %eax 1573 RETURN 1574 1575 ALIGN (4) 1576 L(Byte20): 1577 movzbl -12(%edi), %eax 1578 movzbl -12(%esi), %edx 1579 sub %edx, %eax 1580 RETURN 1581 1582 ALIGN (4) 1583 L(Byte21): 1584 movzbl -11(%edi), %eax 1585 movzbl -11(%esi), %edx 1586 sub %edx, %eax 1587 RETURN 1588 1589 ALIGN (4) 1590 L(Byte22): 1591 movzbl -10(%edi), %eax 1592 movzbl -10(%esi), %edx 1593 sub %edx, %eax 1594 
RETURN /* finishes the L(Byte22) stub begun on the previous line: pop %edi, %esi, %ebx and ret */
1595
1596 ALIGN (4)
/* L(next_24_bytes): reached from L(less16bytes) when %dl is zero, i.e. the
   first difference is among the upper 8 bytes of the 16-byte group.  Both
   pointers move forward by 8 so the Byte16..Byte22 stubs can be reused, this
   time selected by the bits of %dh.  lea is used for the adjustment, which
   leaves EFLAGS untouched. */
1597 L(next_24_bytes):
1598 lea 8(%edi), %edi
1599 lea 8(%esi), %esi
1600 test $0x01, %dh
1601 jnz L(Byte16)
1602
1603 test $0x02, %dh
1604 jnz L(Byte17)
1605
1606 test $0x04, %dh
1607 jnz L(Byte18)
1608
1609 test $0x08, %dh
1610 jnz L(Byte19)
1611
1612 test $0x10, %dh
1613 jnz L(Byte20)
1614
1615 test $0x20, %dh
1616 jnz L(Byte21)
1617
1618 test $0x40, %dh
1619 jnz L(Byte22)
1620
1621 ALIGN (4)
/* None of bits 0..6 of %dh set: fall through and take the last byte of the
   group as the differing one.  Returns (byte at %edi side) - (byte at %esi
   side).  RETURN_END is used instead of RETURN because the code that follows
   runs with only %ebx pushed; the CFI_PUSH records exactly that. */
1622 L(Byte31):
1623 movzbl -9(%edi), %eax
1624 movzbl -9(%esi), %edx
1625 sub %edx, %eax
1626 RETURN_END
1627 CFI_PUSH (%ebx)
1628
1629 ALIGN (4)
/* Length dispatch, part of the L(less48bytes) tail.  %ecx holds the number of
   bytes still to compare; each ladder handles one group of eight values and
   jumps to the matching L(<n>bytes) entry.  The last value of each group needs
   no compare: it is the only one left, so a plain jmp is used. */
/* L(more8bytes): %ecx >= 8 (tested by L(less48bytes)); handles 8..15. */
1630 L(more8bytes):
1631 cmp $16, %ecx
1632 jae L(more16bytes)
1633 cmp $8, %ecx
1634 je L(8bytes)
1635 cmp $9, %ecx
1636 je L(9bytes)
1637 cmp $10, %ecx
1638 je L(10bytes)
1639 cmp $11, %ecx
1640 je L(11bytes)
1641 cmp $12, %ecx
1642 je L(12bytes)
1643 cmp $13, %ecx
1644 je L(13bytes)
1645 cmp $14, %ecx
1646 je L(14bytes)
1647 jmp L(15bytes)
1648
1649 ALIGN (4)
/* L(more16bytes): %ecx >= 16; handles 16..23. */
1650 L(more16bytes):
1651 cmp $24, %ecx
1652 jae L(more24bytes)
1653 cmp $16, %ecx
1654 je L(16bytes)
1655 cmp $17, %ecx
1656 je L(17bytes)
1657 cmp $18, %ecx
1658 je L(18bytes)
1659 cmp $19, %ecx
1660 je L(19bytes)
1661 cmp $20, %ecx
1662 je L(20bytes)
1663 cmp $21, %ecx
1664 je L(21bytes)
1665 cmp $22, %ecx
1666 je L(22bytes)
1667 jmp L(23bytes)
1668
1669 ALIGN (4)
/* L(more24bytes): %ecx >= 24; handles 24..31. */
1670 L(more24bytes):
1671 cmp $32, %ecx
1672 jae L(more32bytes)
1673 cmp $24, %ecx
1674 je L(24bytes)
1675 cmp $25, %ecx
1676 je L(25bytes)
1677 cmp $26, %ecx
1678 je L(26bytes)
1679 cmp $27, %ecx
1680 je L(27bytes)
1681 cmp $28, %ecx
1682 je L(28bytes)
1683 cmp $29, %ecx
1684 je L(29bytes)
1685 cmp $30, %ecx
1686 je L(30bytes)
1687 jmp L(31bytes)
1688
1689 ALIGN (4)
/* L(more32bytes): %ecx >= 32; handles 32..39 (the ladder continues on the
   next line of the file). */
1690 L(more32bytes):
1691 cmp $40, %ecx
1692 jae L(more40bytes)
1693 cmp $32, %ecx
1694 je L(32bytes)
1695 cmp $33, %ecx
1696 je L(33bytes)
1697 cmp $34, %ecx
1698 je L(34bytes)
1699 cmp $35, %ecx
1700 je L(35bytes)
1701 cmp $36, %ecx
1702 je L(36bytes)
1703 cmp $37, %ecx
1704 je L(37bytes)
1705 cmp $38, %ecx
1706 je
L(38bytes) /* operand of the je that ends the previous line */
1707 jmp L(39bytes) /* last value of the 32..39 group */
1708
1709 ALIGN (4)
/* L(more40bytes): %ecx >= 40.  Handles 40..46 explicitly; everything else is
   sent to L(47bytes), so this relies on %ecx being below 48 here. */
1710 L(more40bytes):
1711 cmp $40, %ecx
1712 je L(40bytes)
1713 cmp $41, %ecx
1714 je L(41bytes)
1715 cmp $42, %ecx
1716 je L(42bytes)
1717 cmp $43, %ecx
1718 je L(43bytes)
1719 cmp $44, %ecx
1720 je L(44bytes)
1721 cmp $45, %ecx
1722 je L(45bytes)
1723 cmp $46, %ecx
1724 je L(46bytes)
1725 jmp L(47bytes)
1726
1727 ALIGN (4)
/* L(less48bytes): scalar tail.
   In:  %ecx = number of bytes left to compare
        %eax = one past the last byte of the first buffer
        %edx = one past the last byte of the second buffer
        %ebx is on the stack (every exit below pops it).
   Dispatches to L(<n>bytes), which compares the final n bytes by reading at
   negative offsets from %eax/%edx.
   NOTE(review): counts 0 and 1 are not tested and also end up in L(7bytes).
   The entry code sends lengths 0/1 elsewhere, so this can only matter for
   jumps from the SSE paths, where re-reading bytes before the end is
   presumably harmless because they were already found equal -- confirm. */
1728 L(less48bytes):
1729 cmp $8, %ecx
1730 jae L(more8bytes)
1731 cmp $2, %ecx
1732 je L(2bytes)
1733 cmp $3, %ecx
1734 je L(3bytes)
1735 cmp $4, %ecx
1736 je L(4bytes)
1737 cmp $5, %ecx
1738 je L(5bytes)
1739 cmp $6, %ecx
1740 je L(6bytes)
1741 jmp L(7bytes)
1742
1743
1744 ALIGN (4)
/* Chain for counts that are multiples of 4 (44, 40, ... 4).  Each entry
   compares one 32-bit word n bytes before the end and falls through to the
   next smaller entry.  On a difference it jumps to L(find_diff) with the
   first buffer's word in %ecx and the second buffer's word in %ebx. */
1745 L(44bytes):
1746 mov -44(%eax), %ecx
1747 mov -44(%edx), %ebx
1748 cmp %ebx, %ecx
1749 jne L(find_diff)
1750 L(40bytes):
1751 mov -40(%eax), %ecx
1752 mov -40(%edx), %ebx
1753 cmp %ebx, %ecx
1754 jne L(find_diff)
1755 L(36bytes):
1756 mov -36(%eax), %ecx
1757 mov -36(%edx), %ebx
1758 cmp %ebx, %ecx
1759 jne L(find_diff)
1760 L(32bytes):
1761 mov -32(%eax), %ecx
1762 mov -32(%edx), %ebx
1763 cmp %ebx, %ecx
1764 jne L(find_diff)
1765 L(28bytes):
1766 mov -28(%eax), %ecx
1767 mov -28(%edx), %ebx
1768 cmp %ebx, %ecx
1769 jne L(find_diff)
1770 L(24bytes):
1771 mov -24(%eax), %ecx
1772 mov -24(%edx), %ebx
1773 cmp %ebx, %ecx
1774 jne L(find_diff)
1775 L(20bytes):
1776 mov -20(%eax), %ecx
1777 mov -20(%edx), %ebx
1778 cmp %ebx, %ecx
1779 jne L(find_diff)
1780 L(16bytes):
1781 mov -16(%eax), %ecx
1782 mov -16(%edx), %ebx
1783 cmp %ebx, %ecx
1784 jne L(find_diff)
1785 L(12bytes):
1786 mov -12(%eax), %ecx
1787 mov -12(%edx), %ebx
1788 cmp %ebx, %ecx
1789 jne L(find_diff)
1790 L(8bytes):
1791 mov -8(%eax), %ecx
1792 mov -8(%edx), %ebx
1793 cmp %ebx, %ecx
1794 jne L(find_diff)
1795 L(4bytes):
1796 mov -4(%eax), %ecx
1797 mov -4(%edx), %ebx
1798 cmp %ebx, %ecx
1799 mov $0, %eax /* result for "equal"; mov (unlike xor) keeps the cmp flags for the jne */
1800 jne L(find_diff)
1801 POP (%ebx)
1802 ret
1803 CFI_PUSH (%ebx) /* unwind info only: code below again runs with %ebx pushed */
1804
1805 ALIGN (4)
/* Chain for counts of the form 4k+1 (45, 41, ... 5); continues on the next
   line of the file. */
1806 L(45bytes):
1807 mov -45(%eax), %ecx
1808 mov
-45(%edx), %ebx /* completes the mov begun on the previous line (inside L(45bytes)) */
/* Chain for counts of the form 4k+1.  %eax/%edx point one past the end of the
   two buffers being compared; each entry compares one 32-bit word n bytes
   before the end and falls through to the next smaller entry.  On a
   difference it jumps to L(find_diff) with the %eax-side word in %ecx and the
   %edx-side word in %ebx. */
1809 cmp %ebx, %ecx
1810 jne L(find_diff)
1811 L(41bytes):
1812 mov -41(%eax), %ecx
1813 mov -41(%edx), %ebx
1814 cmp %ebx, %ecx
1815 jne L(find_diff)
1816 L(37bytes):
1817 mov -37(%eax), %ecx
1818 mov -37(%edx), %ebx
1819 cmp %ebx, %ecx
1820 jne L(find_diff)
1821 L(33bytes):
1822 mov -33(%eax), %ecx
1823 mov -33(%edx), %ebx
1824 cmp %ebx, %ecx
1825 jne L(find_diff)
1826 L(29bytes):
1827 mov -29(%eax), %ecx
1828 mov -29(%edx), %ebx
1829 cmp %ebx, %ecx
1830 jne L(find_diff)
1831 L(25bytes):
1832 mov -25(%eax), %ecx
1833 mov -25(%edx), %ebx
1834 cmp %ebx, %ecx
1835 jne L(find_diff)
1836 L(21bytes):
1837 mov -21(%eax), %ecx
1838 mov -21(%edx), %ebx
1839 cmp %ebx, %ecx
1840 jne L(find_diff)
1841 L(17bytes):
1842 mov -17(%eax), %ecx
1843 mov -17(%edx), %ebx
1844 cmp %ebx, %ecx
1845 jne L(find_diff)
1846 L(13bytes):
1847 mov -13(%eax), %ecx
1848 mov -13(%edx), %ebx
1849 cmp %ebx, %ecx
1850 jne L(find_diff)
1851 L(9bytes):
1852 mov -9(%eax), %ecx
1853 mov -9(%edx), %ebx
1854 cmp %ebx, %ecx
1855 jne L(find_diff)
1856 L(5bytes):
1857 mov -5(%eax), %ecx
1858 mov -5(%edx), %ebx
1859 cmp %ebx, %ecx
1860 jne L(find_diff)
/* One byte is left after the last full word. */
1861 movzbl -1(%eax), %ecx
1862 cmp -1(%edx), %cl
1863 mov $0, %eax /* result for "equal"; mov keeps the cmp flags alive */
1864 jne L(end) /* L(end) turns the flags of the byte compare into 1 / -1 */
1865 POP (%ebx)
1866 ret
1867 CFI_PUSH (%ebx) /* unwind info only: code below again runs with %ebx pushed */
1868
1869 ALIGN (4)
/* Chain for counts of the form 4k+2 (46, 42, ... 6), same scheme; continues on
   the next line of the file. */
1870 L(46bytes):
1871 mov -46(%eax), %ecx
1872 mov -46(%edx), %ebx
1873 cmp %ebx, %ecx
1874 jne L(find_diff)
1875 L(42bytes):
1876 mov -42(%eax), %ecx
1877 mov -42(%edx), %ebx
1878 cmp %ebx, %ecx
1879 jne L(find_diff)
1880 L(38bytes):
1881 mov -38(%eax), %ecx
1882 mov -38(%edx), %ebx
1883 cmp %ebx, %ecx
1884 jne L(find_diff)
1885 L(34bytes):
1886 mov -34(%eax), %ecx
1887 mov -34(%edx), %ebx
1888 cmp %ebx, %ecx
1889 jne L(find_diff)
1890 L(30bytes):
1891 mov -30(%eax), %ecx
1892 mov -30(%edx), %ebx
1893 cmp %ebx, %ecx
1894 jne L(find_diff)
1895 L(26bytes):
1896 mov -26(%eax), %ecx
1897 mov -26(%edx), %ebx
1898 cmp %ebx, %ecx
1899 jne L(find_diff)
1900 L(22bytes):
1901 mov -22(%eax), %ecx
1902 mov
-22(%edx), %ebx /* completes the mov begun on the previous line (inside L(22bytes)) */
/* Rest of the chain for counts of the form 4k+2.  %eax/%edx point one past the
   end of the two buffers; each entry compares one 32-bit word n bytes before
   the end and falls through to the next smaller entry.  On a difference it
   jumps to L(find_diff) with the %eax-side word in %ecx and the %edx-side word
   in %ebx. */
1903 cmp %ebx, %ecx
1904 jne L(find_diff)
1905 L(18bytes):
1906 mov -18(%eax), %ecx
1907 mov -18(%edx), %ebx
1908 cmp %ebx, %ecx
1909 jne L(find_diff)
1910 L(14bytes):
1911 mov -14(%eax), %ecx
1912 mov -14(%edx), %ebx
1913 cmp %ebx, %ecx
1914 jne L(find_diff)
1915 L(10bytes):
1916 mov -10(%eax), %ecx
1917 mov -10(%edx), %ebx
1918 cmp %ebx, %ecx
1919 jne L(find_diff)
1920 L(6bytes):
1921 mov -6(%eax), %ecx
1922 mov -6(%edx), %ebx
1923 cmp %ebx, %ecx
1924 jne L(find_diff)
/* Final two bytes, loaded as one 16-bit value and compared byte by byte in
   memory order: low byte first, then high byte. */
1925 L(2bytes):
1926 movzwl -2(%eax), %ecx
1927 movzwl -2(%edx), %ebx
1928 cmp %bl, %cl
1929 jne L(end)
1930 cmp %bh, %ch
1931 mov $0, %eax /* result for "equal"; mov keeps the cmp flags alive */
1932 jne L(end)
1933 POP (%ebx)
1934 ret
1935 CFI_PUSH (%ebx) /* unwind info only: code below again runs with %ebx pushed */
1936
1937 ALIGN (4)
/* Chain for counts of the form 4k+3 (47, 43, ... 7), same scheme as above. */
1938 L(47bytes):
1939 movl -47(%eax), %ecx
1940 movl -47(%edx), %ebx
1941 cmp %ebx, %ecx
1942 jne L(find_diff)
1943 L(43bytes):
1944 movl -43(%eax), %ecx
1945 movl -43(%edx), %ebx
1946 cmp %ebx, %ecx
1947 jne L(find_diff)
1948 L(39bytes):
1949 movl -39(%eax), %ecx
1950 movl -39(%edx), %ebx
1951 cmp %ebx, %ecx
1952 jne L(find_diff)
1953 L(35bytes):
1954 movl -35(%eax), %ecx
1955 movl -35(%edx), %ebx
1956 cmp %ebx, %ecx
1957 jne L(find_diff)
1958 L(31bytes):
1959 movl -31(%eax), %ecx
1960 movl -31(%edx), %ebx
1961 cmp %ebx, %ecx
1962 jne L(find_diff)
1963 L(27bytes):
1964 movl -27(%eax), %ecx
1965 movl -27(%edx), %ebx
1966 cmp %ebx, %ecx
1967 jne L(find_diff)
1968 L(23bytes):
1969 movl -23(%eax), %ecx
1970 movl -23(%edx), %ebx
1971 cmp %ebx, %ecx
1972 jne L(find_diff)
1973 L(19bytes):
1974 movl -19(%eax), %ecx
1975 movl -19(%edx), %ebx
1976 cmp %ebx, %ecx
1977 jne L(find_diff)
1978 L(15bytes):
1979 movl -15(%eax), %ecx
1980 movl -15(%edx), %ebx
1981 cmp %ebx, %ecx
1982 jne L(find_diff)
1983 L(11bytes):
1984 movl -11(%eax), %ecx
1985 movl -11(%edx), %ebx
1986 cmp %ebx, %ecx
1987 jne L(find_diff)
1988 L(7bytes):
1989 movl -7(%eax), %ecx
1990 movl -7(%edx), %ebx
1991 cmp %ebx, %ecx
1992 jne L(find_diff)
/* Final three bytes: a 16-bit load for the first two (continued on the next
   line of the file). */
1993 L(3bytes):
1994 movzwl -3(%eax), %ecx
1995 movzwl -3(%edx),
%ebx /* completes the movzwl begun on the previous line (inside L(3bytes)) */
/* Rest of L(3bytes): %cx / %bx hold the first two of the last three bytes from
   the %eax side and the %edx side.  Compare them in memory order, then the
   final byte. */
1996 cmpb %bl, %cl
1997 jne L(end)
1998 cmp %bx, %cx /* low bytes are equal here, so the high byte decides */
1999 jne L(end)
2000 movzbl -1(%eax), %eax /* %eax is no longer needed as a pointer */
2001 cmpb -1(%edx), %al
2002 mov $0, %eax /* result for "equal"; mov keeps the cmp flags alive */
2003 jne L(end)
2004 POP (%ebx)
2005 ret
2006 CFI_PUSH (%ebx) /* unwind info only: code below again runs with %ebx pushed */
2007
2008 ALIGN (4)
/* L(find_diff): %ecx (from the %eax side) and %ebx (from the %edx side) are
   two 32-bit words known to differ.  Find the first differing byte in memory
   order (lowest byte first on this little-endian target) and reach L(end)
   with the flags of that byte's compare.  Each 16-bit cmp runs only after the
   low bytes were found equal, so its outcome is decided by the high byte.
   After the third check only the top byte can differ, so the last cmp falls
   straight into L(end). */
2009 L(find_diff):
2010 cmpb %bl, %cl
2011 jne L(end)
2012 cmp %bx, %cx
2013 jne L(end)
2014 shr $16,%ecx
2015 shr $16,%ebx
2016 cmp %bl, %cl
2017 jne L(end)
2018 cmp %bx, %cx
/* L(end): turn the flags of the deciding unsigned compare into the result.
   popl and mov do not modify EFLAGS, so ja still sees that compare: return 1
   when the %eax-side byte is above the %edx-side byte, otherwise -1.
   %ebx is restored on the way out. */
2019 L(end):
2020 POP (%ebx)
2021 mov $1, %eax
2022 ja L(bigger)
2023 neg %eax
2024 L(bigger):
2025 ret
2026
2027 END (MEMCMP)
2028