      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
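/*
 * SSSE3-optimized strcmp for 32-bit x86.  With USE_AS_STRNCMP defined the
 * same body is assembled as strncmp, with the remaining byte count kept in
 * %ebp.  A rough scalar equivalent of what the vector code accelerates:
 *
 *   int strcmp(const char *s1, const char *s2) {
 *       while (*s1 && *s1 == *s2) { s1++; s2++; }
 *       return (unsigned char)*s1 - (unsigned char)*s2;
 *   }
 *
 * The vector paths compare 16 bytes per step and use PALIGNR to re-align
 * one operand when the two pointers have different 16-byte alignments
 * (the ashr_1 .. ashr_15 cases below).
 */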
     31 #ifndef L
     32 # define L(label)	.L##label
     33 #endif
     34 
     35 #ifndef cfi_startproc
     36 # define cfi_startproc			.cfi_startproc
     37 #endif
     38 
     39 #ifndef cfi_endproc
     40 # define cfi_endproc			.cfi_endproc
     41 #endif
     42 
     43 #ifndef cfi_rel_offset
     44 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     45 #endif
     46 
     47 #ifndef cfi_restore
     48 # define cfi_restore(reg)		.cfi_restore reg
     49 #endif
     50 
     51 #ifndef cfi_adjust_cfa_offset
     52 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     53 #endif
     54 
     55 #ifndef cfi_remember_state
     56 # define cfi_remember_state		.cfi_remember_state
     57 #endif
     58 
     59 #ifndef cfi_restore_state
     60 # define cfi_restore_state		.cfi_restore_state
     61 #endif
     62 
     63 #ifndef ENTRY
     64 # define ENTRY(name)			\
     65 	.type name,  @function; 	\
     66 	.globl name;			\
     67 	.p2align 4;			\
     68 name:					\
     69 	cfi_startproc
     70 #endif
     71 
     72 #ifndef END
     73 # define END(name)			\
     74 	cfi_endproc;			\
     75 	.size name, .-name
     76 #endif
     77 
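/*
 * PUSH/POP wrap pushl/popl so every stack adjustment is mirrored in the
 * unwind information emitted through the cfi_* macros above.
 */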
     78 #define CFI_PUSH(REG)						\
     79   cfi_adjust_cfa_offset (4);					\
     80   cfi_rel_offset (REG, 0)
     81 
     82 #define CFI_POP(REG)						\
     83   cfi_adjust_cfa_offset (-4);					\
     84   cfi_restore (REG)
     85 
     86 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     87 #define POP(REG)	popl REG; CFI_POP (REG)
     88 
     89 #ifndef USE_AS_STRNCMP
     90 # define STR1		4
     91 # define STR2		STR1+4
     92 # define RETURN		ret
     93 
     94 # define UPDATE_STRNCMP_COUNTER
     95 #else
     96 # define STR1		8
     97 # define STR2		STR1+4
     98 # define CNT		STR2+4
     99 # define RETURN		POP (%ebp); ret; CFI_PUSH (%ebp)
    100 
    101 # define UPDATE_STRNCMP_COUNTER				\
    102 	/* calculate left number to compare */		\
    103 	mov	$16, %esi;				\
    104 	sub	%ecx, %esi;				\
    105 	cmp	%esi, %ebp;				\
    106 	jbe	L(more8byteseq);			\
    107 	sub	%esi, %ebp
    108 #endif
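/*
 * STR1/STR2 (and CNT for strncmp) are the %esp-relative offsets of the
 * incoming arguments; they are 4 bytes larger in the strncmp build because
 * %ebp is pushed on entry to hold the remaining byte count.
 * UPDATE_STRNCMP_COUNTER charges the 16 - %ecx bytes covered by the first
 * aligned block against that count and bails out to L(more8byteseq) once
 * the count is exhausted.
 */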
    109 
    110 	.section .text.ssse3,"ax",@progbits
    111 ENTRY (ssse3_strcmp_latest)
    112 #ifdef USE_AS_STRNCMP
    113 	PUSH	(%ebp)
    114 #endif
    115 	movl	STR1(%esp), %edx
    116 	movl	STR2(%esp), %eax
    117 #ifdef USE_AS_STRNCMP
    118 	movl	CNT(%esp), %ebp
    119 	cmp	$16, %ebp
    120 	jb	L(less16bytes_sncmp)
    121 	jmp	L(more16bytes)
    122 #endif
    123 
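/*
 * Plain strcmp only: compare the first eight bytes one at a time, which
 * avoids the SSE setup cost for short or early-mismatching strings.  The
 * strncmp build branches past this block to L(more16bytes).
 */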
    124 	movzbl	(%eax), %ecx
    125 	cmpb	%cl, (%edx)
    126 	jne	L(neq)
    127 	cmpl	$0, %ecx
    128 	je	L(eq)
    129 
    130 	movzbl	1(%eax), %ecx
    131 	cmpb	%cl, 1(%edx)
    132 	jne	L(neq)
    133 	cmpl	$0, %ecx
    134 	je	L(eq)
    135 
    136 	movzbl	2(%eax), %ecx
    137 	cmpb	%cl, 2(%edx)
    138 	jne	L(neq)
    139 	cmpl	$0, %ecx
    140 	je	L(eq)
    141 
    142 	movzbl	3(%eax), %ecx
    143 	cmpb	%cl, 3(%edx)
    144 	jne	L(neq)
    145 	cmpl	$0, %ecx
    146 	je	L(eq)
    147 
    148 	movzbl	4(%eax), %ecx
    149 	cmpb	%cl, 4(%edx)
    150 	jne	L(neq)
    151 	cmpl	$0, %ecx
    152 	je	L(eq)
    153 
    154 	movzbl	5(%eax), %ecx
    155 	cmpb	%cl, 5(%edx)
    156 	jne	L(neq)
    157 	cmpl	$0, %ecx
    158 	je	L(eq)
    159 
    160 	movzbl	6(%eax), %ecx
    161 	cmpb	%cl, 6(%edx)
    162 	jne	L(neq)
    163 	cmpl	$0, %ecx
    164 	je	L(eq)
    165 
    166 	movzbl	7(%eax), %ecx
    167 	cmpb	%cl, 7(%edx)
    168 	jne	L(neq)
    169 	cmpl	$0, %ecx
    170 	je	L(eq)
    171 
    172 	add	$8, %edx
    173 	add	$8, %eax
    174 #ifdef USE_AS_STRNCMP
    175 	cmp	$8, %ebp
    176 	lea	-8(%ebp), %ebp
    177 	je	L(eq)
    178 L(more16bytes):
    179 #endif
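/*
 * If neither pointer is too close to the end of a 4K page, compare one
 * unaligned 16-byte block: xmm0 marks the NUL bytes of the %eax data,
 * xmm1 the positions where both strings match, and psubb/pmovmskb fold
 * them into a mask that equals 0xffff only when all 16 bytes match and
 * none of them is a terminator.
 */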
    180 	movl	%edx, %ecx
    181 	and	$0xfff, %ecx
    182 	cmp	$0xff0, %ecx
    183 	ja	L(crosspage)
    184 	mov	%eax, %ecx
    185 	and	$0xfff, %ecx
    186 	cmp	$0xff0, %ecx
    187 	ja	L(crosspage)
    188 	pxor	%xmm0, %xmm0
    189 	movlpd	(%eax), %xmm1
    190 	movlpd	(%edx), %xmm2
    191 	movhpd	8(%eax), %xmm1
    192 	movhpd	8(%edx), %xmm2
    193 	pcmpeqb	%xmm1, %xmm0
    194 	pcmpeqb	%xmm2, %xmm1
    195 	psubb	%xmm0, %xmm1
    196 	pmovmskb %xmm1, %ecx
    197 	sub	$0xffff, %ecx
    198 	jnz	L(less16bytes)
    199 #ifdef USE_AS_STRNCMP
    200 	cmp	$16, %ebp
    201 	lea	-16(%ebp), %ebp
    202 	jbe	L(eq)
    203 #endif
    204 	add	$16, %eax
    205 	add	$16, %edx
    206 
    207 L(crosspage):
    208 
    209 	PUSH	(%ebx)
    210 	PUSH	(%edi)
    211 	PUSH	(%esi)
    212 #ifdef USE_AS_STRNCMP
    213 	cfi_remember_state
    214 #endif
    215 
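/*
 * Crosspage / misaligned path.  Align both pointers down to 16 bytes;
 * %ecx and %edi keep the discarded low bits, and bit 5 of %ebx records
 * that %eax/%edx were swapped so the swap can be undone before the final
 * byte compare.  The relative misalignment then selects one of the
 * ashr_1 .. ashr_15 loops (ashr_0 handles identical alignment).  In the
 * per-case comments below, "offset of esi"/"offset of edi" refer to the
 * alignment offsets of the two operands, held here in %ecx and %edi.
 */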
    216 	movl	%edx, %edi
    217 	movl	%eax, %ecx
    218 	and	$0xf, %ecx
    219 	and	$0xf, %edi
    220 	xor	%ecx, %eax
    221 	xor	%edi, %edx
    222 	xor	%ebx, %ebx
    223 	cmp	%edi, %ecx
    224 	je	L(ashr_0)
    225 	ja	L(bigger)
    226 	or	$0x20, %ebx
    227 	xchg	%edx, %eax
    228 	xchg	%ecx, %edi
    229 L(bigger):
    230 	lea	15(%edi), %edi
    231 	sub	%ecx, %edi
    232 	cmp	$8, %edi
    233 	jle	L(ashr_less_8)
    234 	cmp	$14, %edi
    235 	je	L(ashr_15)
    236 	cmp	$13, %edi
    237 	je	L(ashr_14)
    238 	cmp	$12, %edi
    239 	je	L(ashr_13)
    240 	cmp	$11, %edi
    241 	je	L(ashr_12)
    242 	cmp	$10, %edi
    243 	je	L(ashr_11)
    244 	cmp	$9, %edi
    245 	je	L(ashr_10)
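	/*
	 * %edi values 9..14 were handled above, so L(ashr_less_8) is only
	 * entered through the jle at the top of this chain; the je below
	 * therefore still tests the flags of "cmp $8, %edi" and selects
	 * ashr_9 when %edi == 8.
	 */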
    246 L(ashr_less_8):
    247 	je	L(ashr_9)
    248 	cmp	$7, %edi
    249 	je	L(ashr_8)
    250 	cmp	$6, %edi
    251 	je	L(ashr_7)
    252 	cmp	$5, %edi
    253 	je	L(ashr_6)
    254 	cmp	$4, %edi
    255 	je	L(ashr_5)
    256 	cmp	$3, %edi
    257 	je	L(ashr_4)
    258 	cmp	$2, %edi
    259 	je	L(ashr_3)
    260 	cmp	$1, %edi
    261 	je	L(ashr_2)
    262 	cmp	$0, %edi
    263 	je	L(ashr_1)
    264 
    265 /*
    266  * The following cases will be handled by ashr_0
    267  *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
    268  *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
    269  */
    270 	.p2align 4
    271 L(ashr_0):
    272 	mov	$0xffff, %esi
    273 	movdqa	(%eax), %xmm1
    274 	pxor	%xmm0, %xmm0
    275 	pcmpeqb	%xmm1, %xmm0
    276 	pcmpeqb	(%edx), %xmm1
    277 	psubb	%xmm0, %xmm1
    278 	pmovmskb %xmm1, %edi
    279 	shr	%cl, %esi
    280 	shr	%cl, %edi
    281 	sub	%edi, %esi
    282 	mov	%ecx, %edi
    283 	jne	L(less32bytes)
    284 	UPDATE_STRNCMP_COUNTER
    285 	mov	$0x10, %ebx
    286 	mov	$0x10, %ecx
    287 	pxor	%xmm0, %xmm0
    288 	.p2align 4
    289 L(loop_ashr_0):
    290 	movdqa	(%eax, %ecx), %xmm1
    291 	movdqa	(%edx, %ecx), %xmm2
    292 
    293 	pcmpeqb	%xmm1, %xmm0
    294 	pcmpeqb	%xmm2, %xmm1
    295 	psubb	%xmm0, %xmm1
    296 	pmovmskb %xmm1, %esi
    297 	sub	$0xffff, %esi
    298 	jnz	L(exit)
    299 #ifdef USE_AS_STRNCMP
    300 	cmp	$16, %ebp
    301 	lea	-16(%ebp), %ebp
    302 	jbe	L(more8byteseq)
    303 #endif
    304 	add	$16, %ecx
    305 	jmp	L(loop_ashr_0)
    306 
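/*
 * ashr_1 .. ashr_15 share one pattern: xmm3 holds the previous aligned
 * 16-byte block loaded from %edx, and "palignr $N" stitches it to the
 * next block so the %edx data is compared at the same alignment as the
 * %eax data.  %edi counts up toward the next 4K page boundary of the
 * shifted %edx stream; when a 16-byte step would reach it, the code
 * detours to L(nibble_ashr_N), which checks the tail of xmm3 for a
 * terminator before any load that touches the following page.
 */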
    307 /*
    308  * The following cases will be handled by ashr_1
    309  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    310  *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
    311  */
    312 	.p2align 4
    313 L(ashr_1):
    314 	mov	$0xffff, %esi
    315 	pxor	%xmm0, %xmm0
    316 	movdqa	(%edx), %xmm2
    317 	movdqa	(%eax), %xmm1
    318 	pcmpeqb	%xmm1, %xmm0
    319 	pslldq	$15, %xmm2
    320 	pcmpeqb	%xmm1, %xmm2
    321 	psubb	%xmm0, %xmm2
    322 	pmovmskb %xmm2, %edi
    323 	shr	%cl, %esi
    324 	shr	%cl, %edi
    325 	sub	%edi, %esi
    326 	lea	-15(%ecx), %edi
    327 	jnz	L(less32bytes)
    328 
    329 	UPDATE_STRNCMP_COUNTER
    330 
    331 	movdqa	(%edx), %xmm3
    332 	pxor	%xmm0, %xmm0
    333 	mov	$16, %ecx
    334 	or	$1, %ebx
    335 	lea	1(%edx), %edi
    336 	and	$0xfff, %edi
    337 	sub	$0x1000, %edi
    338 
    339 	.p2align 4
    340 L(loop_ashr_1):
    341 	add	$16, %edi
    342 	jg	L(nibble_ashr_1)
    343 
    344 L(gobble_ashr_1):
    345 	movdqa	(%eax, %ecx), %xmm1
    346 	movdqa	(%edx, %ecx), %xmm2
    347 	movdqa	%xmm2, %xmm4
    348 
    349 	palignr	$1, %xmm3, %xmm2
    350 
    351 	pcmpeqb	%xmm1, %xmm0
    352 	pcmpeqb	%xmm2, %xmm1
    353 	psubb	%xmm0, %xmm1
    354 	pmovmskb %xmm1, %esi
    355 	sub	$0xffff, %esi
    356 	jnz	L(exit)
    357 #ifdef USE_AS_STRNCMP
    358 	cmp	$16, %ebp
    359 	lea	-16(%ebp), %ebp
    360 	jbe	L(more8byteseq)
    361 #endif
    362 
    363 	add	$16, %ecx
    364 	movdqa	%xmm4, %xmm3
    365 
    366 	add	$16, %edi
    367 	jg	L(nibble_ashr_1)
    368 
    369 	movdqa	(%eax, %ecx), %xmm1
    370 	movdqa	(%edx, %ecx), %xmm2
    371 	movdqa	%xmm2, %xmm4
    372 
    373 	palignr	$1, %xmm3, %xmm2
    374 
    375 	pcmpeqb	%xmm1, %xmm0
    376 	pcmpeqb	%xmm2, %xmm1
    377 	psubb	%xmm0, %xmm1
    378 	pmovmskb %xmm1, %esi
    379 	sub	$0xffff, %esi
    380 	jnz	L(exit)
    381 
    382 #ifdef USE_AS_STRNCMP
    383 	cmp	$16, %ebp
    384 	lea	-16(%ebp), %ebp
    385 	jbe	L(more8byteseq)
    386 #endif
    387 	add	$16, %ecx
    388 	movdqa	%xmm4, %xmm3
    389 	jmp	L(loop_ashr_1)
    390 
    391 	.p2align 4
    392 L(nibble_ashr_1):
    393 	pcmpeqb	%xmm3, %xmm0
    394 	pmovmskb %xmm0, %esi
    395 	test	$0xfffe, %esi
    396 	jnz	L(ashr_1_exittail)
    397 
    398 #ifdef USE_AS_STRNCMP
    399 	cmp	$15, %ebp
    400 	jbe	L(ashr_1_exittail)
    401 #endif
    402 	pxor	%xmm0, %xmm0
    403 	sub	$0x1000, %edi
    404 	jmp	L(gobble_ashr_1)
    405 
    406 	.p2align 4
    407 L(ashr_1_exittail):
    408 	movdqa	(%eax, %ecx), %xmm1
    409 	psrldq	$1, %xmm0
    410 	psrldq	$1, %xmm3
    411 	jmp	L(aftertail)
    412 
    413 /*
    414  * The following cases will be handled by ashr_2
    415  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    416  *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
    417  */
    418 	.p2align 4
    419 L(ashr_2):
    420 	mov	$0xffff, %esi
    421 	pxor	%xmm0, %xmm0
    422 	movdqa	(%edx), %xmm2
    423 	movdqa	(%eax), %xmm1
    424 	pcmpeqb	%xmm1, %xmm0
    425 	pslldq	$14, %xmm2
    426 	pcmpeqb	%xmm1, %xmm2
    427 	psubb	%xmm0, %xmm2
    428 	pmovmskb %xmm2, %edi
    429 	shr	%cl, %esi
    430 	shr	%cl, %edi
    431 	sub	%edi, %esi
    432 	lea	-14(%ecx), %edi
    433 	jnz	L(less32bytes)
    434 
    435 	UPDATE_STRNCMP_COUNTER
    436 
    437 	movdqa	(%edx), %xmm3
    438 	pxor	%xmm0, %xmm0
    439 	mov	$16, %ecx
    440 	or	$2, %ebx
    441 	lea	2(%edx), %edi
    442 	and	$0xfff, %edi
    443 	sub	$0x1000, %edi
    444 
    445 	.p2align 4
    446 L(loop_ashr_2):
    447 	add	$16, %edi
    448 	jg	L(nibble_ashr_2)
    449 
    450 L(gobble_ashr_2):
    451 	movdqa	(%eax, %ecx), %xmm1
    452 	movdqa	(%edx, %ecx), %xmm2
    453 	movdqa	%xmm2, %xmm4
    454 
    455 	palignr	$2, %xmm3, %xmm2
    456 
    457 	pcmpeqb	%xmm1, %xmm0
    458 	pcmpeqb	%xmm2, %xmm1
    459 	psubb	%xmm0, %xmm1
    460 	pmovmskb %xmm1, %esi
    461 	sub	$0xffff, %esi
    462 	jnz	L(exit)
    463 
    464 #ifdef USE_AS_STRNCMP
    465 	cmp	$16, %ebp
    466 	lea	-16(%ebp), %ebp
    467 	jbe	L(more8byteseq)
    468 #endif
    469 	add	$16, %ecx
    470 	movdqa	%xmm4, %xmm3
    471 
    472 	add	$16, %edi
    473 	jg	L(nibble_ashr_2)
    474 
    475 	movdqa	(%eax, %ecx), %xmm1
    476 	movdqa	(%edx, %ecx), %xmm2
    477 	movdqa	%xmm2, %xmm4
    478 
    479 	palignr	$2, %xmm3, %xmm2
    480 
    481 	pcmpeqb	%xmm1, %xmm0
    482 	pcmpeqb	%xmm2, %xmm1
    483 	psubb	%xmm0, %xmm1
    484 	pmovmskb %xmm1, %esi
    485 	sub	$0xffff, %esi
    486 	jnz	L(exit)
    487 
    488 #ifdef USE_AS_STRNCMP
    489 	cmp	$16, %ebp
    490 	lea	-16(%ebp), %ebp
    491 	jbe	L(more8byteseq)
    492 #endif
    493 	add	$16, %ecx
    494 	movdqa	%xmm4, %xmm3
    495 	jmp	L(loop_ashr_2)
    496 
    497 	.p2align 4
    498 L(nibble_ashr_2):
    499 	pcmpeqb	%xmm3, %xmm0
    500 	pmovmskb %xmm0, %esi
    501 	test	$0xfffc, %esi
    502 	jnz	L(ashr_2_exittail)
    503 
    504 #ifdef USE_AS_STRNCMP
    505 	cmp	$14, %ebp
    506 	jbe	L(ashr_2_exittail)
    507 #endif
    508 
    509 	pxor	%xmm0, %xmm0
    510 	sub	$0x1000, %edi
    511 	jmp	L(gobble_ashr_2)
    512 
    513 	.p2align 4
    514 L(ashr_2_exittail):
    515 	movdqa	(%eax, %ecx), %xmm1
    516 	psrldq	$2, %xmm0
    517 	psrldq	$2, %xmm3
    518 	jmp	L(aftertail)
    519 
    520 /*
    521  * The following cases will be handled by ashr_3
    522  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    523  *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
    524  */
    525 	.p2align 4
    526 L(ashr_3):
    527 	mov	$0xffff, %esi
    528 	pxor	%xmm0, %xmm0
    529 	movdqa	(%edx), %xmm2
    530 	movdqa	(%eax), %xmm1
    531 	pcmpeqb	%xmm1, %xmm0
    532 	pslldq	$13, %xmm2
    533 	pcmpeqb	%xmm1, %xmm2
    534 	psubb	%xmm0, %xmm2
    535 	pmovmskb %xmm2, %edi
    536 	shr	%cl, %esi
    537 	shr	%cl, %edi
    538 	sub	%edi, %esi
    539 	lea	-13(%ecx), %edi
    540 	jnz	L(less32bytes)
    541 
    542 	UPDATE_STRNCMP_COUNTER
    543 
    544 	movdqa	(%edx), %xmm3
    545 	pxor	%xmm0, %xmm0
    546 	mov	$16, %ecx
    547 	or	$3, %ebx
    548 	lea	3(%edx), %edi
    549 	and	$0xfff, %edi
    550 	sub	$0x1000, %edi
    551 
    552 	.p2align 4
    553 L(loop_ashr_3):
    554 	add	$16, %edi
    555 	jg	L(nibble_ashr_3)
    556 
    557 L(gobble_ashr_3):
    558 	movdqa	(%eax, %ecx), %xmm1
    559 	movdqa	(%edx, %ecx), %xmm2
    560 	movdqa	%xmm2, %xmm4
    561 
    562 	palignr	$3, %xmm3, %xmm2
    563 
    564 	pcmpeqb	%xmm1, %xmm0
    565 	pcmpeqb	%xmm2, %xmm1
    566 	psubb	%xmm0, %xmm1
    567 	pmovmskb %xmm1, %esi
    568 	sub	$0xffff, %esi
    569 	jnz	L(exit)
    570 
    571 #ifdef USE_AS_STRNCMP
    572 	cmp	$16, %ebp
    573 	lea	-16(%ebp), %ebp
    574 	jbe	L(more8byteseq)
    575 #endif
    576 	add	$16, %ecx
    577 	movdqa	%xmm4, %xmm3
    578 
    579 	add	$16, %edi
    580 	jg	L(nibble_ashr_3)
    581 
    582 	movdqa	(%eax, %ecx), %xmm1
    583 	movdqa	(%edx, %ecx), %xmm2
    584 	movdqa	%xmm2, %xmm4
    585 
    586 	palignr	$3, %xmm3, %xmm2
    587 
    588 	pcmpeqb	%xmm1, %xmm0
    589 	pcmpeqb	%xmm2, %xmm1
    590 	psubb	%xmm0, %xmm1
    591 	pmovmskb %xmm1, %esi
    592 	sub	$0xffff, %esi
    593 	jnz	L(exit)
    594 
    595 #ifdef USE_AS_STRNCMP
    596 	cmp	$16, %ebp
    597 	lea	-16(%ebp), %ebp
    598 	jbe	L(more8byteseq)
    599 #endif
    600 	add	$16, %ecx
    601 	movdqa	%xmm4, %xmm3
    602 	jmp	L(loop_ashr_3)
    603 
    604 	.p2align 4
    605 L(nibble_ashr_3):
    606 	pcmpeqb	%xmm3, %xmm0
    607 	pmovmskb %xmm0, %esi
    608 	test	$0xfff8, %esi
    609 	jnz	L(ashr_3_exittail)
    610 
    611 #ifdef USE_AS_STRNCMP
    612 	cmp	$13, %ebp
    613 	jbe	L(ashr_3_exittail)
    614 #endif
    615 	pxor	%xmm0, %xmm0
    616 	sub	$0x1000, %edi
    617 	jmp	L(gobble_ashr_3)
    618 
    619 	.p2align 4
    620 L(ashr_3_exittail):
    621 	movdqa	(%eax, %ecx), %xmm1
    622 	psrldq	$3, %xmm0
    623 	psrldq	$3, %xmm3
    624 	jmp	L(aftertail)
    625 
    626 /*
    627  * The following cases will be handled by ashr_4
    628  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    629  *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
    630  */
    631 	.p2align 4
    632 L(ashr_4):
    633 	mov	$0xffff, %esi
    634 	pxor	%xmm0, %xmm0
    635 	movdqa	(%edx), %xmm2
    636 	movdqa	(%eax), %xmm1
    637 	pcmpeqb	%xmm1, %xmm0
    638 	pslldq	$12, %xmm2
    639 	pcmpeqb	%xmm1, %xmm2
    640 	psubb	%xmm0, %xmm2
    641 	pmovmskb %xmm2, %edi
    642 	shr	%cl, %esi
    643 	shr	%cl, %edi
    644 	sub	%edi, %esi
    645 	lea	-12(%ecx), %edi
    646 	jnz	L(less32bytes)
    647 
    648 	UPDATE_STRNCMP_COUNTER
    649 
    650 	movdqa	(%edx), %xmm3
    651 	pxor	%xmm0, %xmm0
    652 	mov	$16, %ecx
    653 	or	$4, %ebx
    654 	lea	4(%edx), %edi
    655 	and	$0xfff, %edi
    656 	sub	$0x1000, %edi
    657 
    658 	.p2align 4
    659 L(loop_ashr_4):
    660 	add	$16, %edi
    661 	jg	L(nibble_ashr_4)
    662 
    663 L(gobble_ashr_4):
    664 	movdqa	(%eax, %ecx), %xmm1
    665 	movdqa	(%edx, %ecx), %xmm2
    666 	movdqa	%xmm2, %xmm4
    667 
    668 	palignr	$4, %xmm3, %xmm2
    669 
    670 	pcmpeqb	%xmm1, %xmm0
    671 	pcmpeqb	%xmm2, %xmm1
    672 	psubb	%xmm0, %xmm1
    673 	pmovmskb %xmm1, %esi
    674 	sub	$0xffff, %esi
    675 	jnz	L(exit)
    676 
    677 #ifdef USE_AS_STRNCMP
    678 	cmp	$16, %ebp
    679 	lea	-16(%ebp), %ebp
    680 	jbe	L(more8byteseq)
    681 #endif
    682 
    683 	add	$16, %ecx
    684 	movdqa	%xmm4, %xmm3
    685 
    686 	add	$16, %edi
    687 	jg	L(nibble_ashr_4)
    688 
    689 	movdqa	(%eax, %ecx), %xmm1
    690 	movdqa	(%edx, %ecx), %xmm2
    691 	movdqa	%xmm2, %xmm4
    692 
    693 	palignr	$4, %xmm3, %xmm2
    694 
    695 	pcmpeqb	%xmm1, %xmm0
    696 	pcmpeqb	%xmm2, %xmm1
    697 	psubb	%xmm0, %xmm1
    698 	pmovmskb %xmm1, %esi
    699 	sub	$0xffff, %esi
    700 	jnz	L(exit)
    701 
    702 #ifdef USE_AS_STRNCMP
    703 	cmp	$16, %ebp
    704 	lea	-16(%ebp), %ebp
    705 	jbe	L(more8byteseq)
    706 #endif
    707 
    708 	add	$16, %ecx
    709 	movdqa	%xmm4, %xmm3
    710 	jmp	L(loop_ashr_4)
    711 
    712 	.p2align 4
    713 L(nibble_ashr_4):
    714 	pcmpeqb	%xmm3, %xmm0
    715 	pmovmskb %xmm0, %esi
    716 	test	$0xfff0, %esi
    717 	jnz	L(ashr_4_exittail)
    718 
    719 #ifdef USE_AS_STRNCMP
    720 	cmp	$12, %ebp
    721 	jbe	L(ashr_4_exittail)
    722 #endif
    723 
    724 	pxor	%xmm0, %xmm0
    725 	sub	$0x1000, %edi
    726 	jmp	L(gobble_ashr_4)
    727 
    728 	.p2align 4
    729 L(ashr_4_exittail):
    730 	movdqa	(%eax, %ecx), %xmm1
    731 	psrldq	$4, %xmm0
    732 	psrldq	$4, %xmm3
    733 	jmp	L(aftertail)
    734 
    735 /*
    736  * The following cases will be handled by ashr_5
    737  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    738  *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
    739  */
    740 	.p2align 4
    741 L(ashr_5):
    742 	mov	$0xffff, %esi
    743 	pxor	%xmm0, %xmm0
    744 	movdqa	(%edx), %xmm2
    745 	movdqa	(%eax), %xmm1
    746 	pcmpeqb	%xmm1, %xmm0
    747 	pslldq	$11, %xmm2
    748 	pcmpeqb	%xmm1, %xmm2
    749 	psubb	%xmm0, %xmm2
    750 	pmovmskb %xmm2, %edi
    751 	shr	%cl, %esi
    752 	shr	%cl, %edi
    753 	sub	%edi, %esi
    754 	lea	-11(%ecx), %edi
    755 	jnz	L(less32bytes)
    756 
    757 	UPDATE_STRNCMP_COUNTER
    758 
    759 	movdqa	(%edx), %xmm3
    760 	pxor	%xmm0, %xmm0
    761 	mov	$16, %ecx
    762 	or	$5, %ebx
    763 	lea	5(%edx), %edi
    764 	and	$0xfff, %edi
    765 	sub	$0x1000, %edi
    766 
    767 	.p2align 4
    768 L(loop_ashr_5):
    769 	add	$16, %edi
    770 	jg	L(nibble_ashr_5)
    771 
    772 L(gobble_ashr_5):
    773 	movdqa	(%eax, %ecx), %xmm1
    774 	movdqa	(%edx, %ecx), %xmm2
    775 	movdqa	%xmm2, %xmm4
    776 
    777 	palignr	$5, %xmm3, %xmm2
    778 
    779 	pcmpeqb	%xmm1, %xmm0
    780 	pcmpeqb	%xmm2, %xmm1
    781 	psubb	%xmm0, %xmm1
    782 	pmovmskb %xmm1, %esi
    783 	sub	$0xffff, %esi
    784 	jnz	L(exit)
    785 
    786 #ifdef USE_AS_STRNCMP
    787 	cmp	$16, %ebp
    788 	lea	-16(%ebp), %ebp
    789 	jbe	L(more8byteseq)
    790 #endif
    791 	add	$16, %ecx
    792 	movdqa	%xmm4, %xmm3
    793 
    794 	add	$16, %edi
    795 	jg	L(nibble_ashr_5)
    796 
    797 	movdqa	(%eax, %ecx), %xmm1
    798 	movdqa	(%edx, %ecx), %xmm2
    799 	movdqa	%xmm2, %xmm4
    800 
    801 	palignr	$5, %xmm3, %xmm2
    802 
    803 	pcmpeqb	%xmm1, %xmm0
    804 	pcmpeqb	%xmm2, %xmm1
    805 	psubb	%xmm0, %xmm1
    806 	pmovmskb %xmm1, %esi
    807 	sub	$0xffff, %esi
    808 	jnz	L(exit)
    809 
    810 #ifdef USE_AS_STRNCMP
    811 	cmp	$16, %ebp
    812 	lea	-16(%ebp), %ebp
    813 	jbe	L(more8byteseq)
    814 #endif
    815 	add	$16, %ecx
    816 	movdqa	%xmm4, %xmm3
    817 	jmp	L(loop_ashr_5)
    818 
    819 	.p2align 4
    820 L(nibble_ashr_5):
    821 	pcmpeqb	%xmm3, %xmm0
    822 	pmovmskb %xmm0, %esi
    823 	test	$0xffe0, %esi
    824 	jnz	L(ashr_5_exittail)
    825 
    826 #ifdef USE_AS_STRNCMP
    827 	cmp	$11, %ebp
    828 	jbe	L(ashr_5_exittail)
    829 #endif
    830 	pxor	%xmm0, %xmm0
    831 	sub	$0x1000, %edi
    832 	jmp	L(gobble_ashr_5)
    833 
    834 	.p2align 4
    835 L(ashr_5_exittail):
    836 	movdqa	(%eax, %ecx), %xmm1
    837 	psrldq	$5, %xmm0
    838 	psrldq	$5, %xmm3
    839 	jmp	L(aftertail)
    840 
    841 /*
    842  * The following cases will be handled by ashr_6
    843  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    844  *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
    845  */
    846 
    847 	.p2align 4
    848 L(ashr_6):
    849 	mov	$0xffff, %esi
    850 	pxor	%xmm0, %xmm0
    851 	movdqa	(%edx), %xmm2
    852 	movdqa	(%eax), %xmm1
    853 	pcmpeqb	%xmm1, %xmm0
    854 	pslldq	$10, %xmm2
    855 	pcmpeqb	%xmm1, %xmm2
    856 	psubb	%xmm0, %xmm2
    857 	pmovmskb %xmm2, %edi
    858 	shr	%cl, %esi
    859 	shr	%cl, %edi
    860 	sub	%edi, %esi
    861 	lea	-10(%ecx), %edi
    862 	jnz	L(less32bytes)
    863 
    864 	UPDATE_STRNCMP_COUNTER
    865 
    866 	movdqa	(%edx), %xmm3
    867 	pxor	%xmm0, %xmm0
    868 	mov	$16, %ecx
    869 	or	$6, %ebx
    870 	lea	6(%edx), %edi
    871 	and	$0xfff, %edi
    872 	sub	$0x1000, %edi
    873 
    874 	.p2align 4
    875 L(loop_ashr_6):
    876 	add	$16, %edi
    877 	jg	L(nibble_ashr_6)
    878 
    879 L(gobble_ashr_6):
    880 	movdqa	(%eax, %ecx), %xmm1
    881 	movdqa	(%edx, %ecx), %xmm2
    882 	movdqa	%xmm2, %xmm4
    883 
    884 	palignr	$6, %xmm3, %xmm2
    885 
    886 	pcmpeqb	%xmm1, %xmm0
    887 	pcmpeqb	%xmm2, %xmm1
    888 	psubb	%xmm0, %xmm1
    889 	pmovmskb %xmm1, %esi
    890 	sub	$0xffff, %esi
    891 	jnz	L(exit)
    892 
    893 #ifdef USE_AS_STRNCMP
    894 	cmp	$16, %ebp
    895 	lea	-16(%ebp), %ebp
    896 	jbe	L(more8byteseq)
    897 #endif
    898 
    899 	add	$16, %ecx
    900 	movdqa	%xmm4, %xmm3
    901 
    902 	add	$16, %edi
    903 	jg	L(nibble_ashr_6)
    904 
    905 	movdqa	(%eax, %ecx), %xmm1
    906 	movdqa	(%edx, %ecx), %xmm2
    907 	movdqa	%xmm2, %xmm4
    908 
    909 	palignr	$6, %xmm3, %xmm2
    910 
    911 	pcmpeqb	%xmm1, %xmm0
    912 	pcmpeqb	%xmm2, %xmm1
    913 	psubb	%xmm0, %xmm1
    914 	pmovmskb %xmm1, %esi
    915 	sub	$0xffff, %esi
    916 	jnz	L(exit)
    917 #ifdef USE_AS_STRNCMP
    918 	cmp	$16, %ebp
    919 	lea	-16(%ebp), %ebp
    920 	jbe	L(more8byteseq)
    921 #endif
    922 
    923 	add	$16, %ecx
    924 	movdqa	%xmm4, %xmm3
    925 	jmp	L(loop_ashr_6)
    926 
    927 	.p2align 4
    928 L(nibble_ashr_6):
    929 	pcmpeqb	%xmm3, %xmm0
    930 	pmovmskb %xmm0, %esi
    931 	test	$0xffc0, %esi
    932 	jnz	L(ashr_6_exittail)
    933 
    934 #ifdef USE_AS_STRNCMP
    935 	cmp	$10, %ebp
    936 	jbe	L(ashr_6_exittail)
    937 #endif
    938 	pxor	%xmm0, %xmm0
    939 	sub	$0x1000, %edi
    940 	jmp	L(gobble_ashr_6)
    941 
    942 	.p2align 4
    943 L(ashr_6_exittail):
    944 	movdqa	(%eax, %ecx), %xmm1
    945 	psrldq	$6, %xmm0
    946 	psrldq	$6, %xmm3
    947 	jmp	L(aftertail)
    948 
    949 /*
    950  * The following cases will be handled by ashr_7
    951  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
    952  *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
    953  */
    954 
    955 	.p2align 4
    956 L(ashr_7):
    957 	mov	$0xffff, %esi
    958 	pxor	%xmm0, %xmm0
    959 	movdqa	(%edx), %xmm2
    960 	movdqa	(%eax), %xmm1
    961 	pcmpeqb	%xmm1, %xmm0
    962 	pslldq	$9, %xmm2
    963 	pcmpeqb	%xmm1, %xmm2
    964 	psubb	%xmm0, %xmm2
    965 	pmovmskb %xmm2, %edi
    966 	shr	%cl, %esi
    967 	shr	%cl, %edi
    968 	sub	%edi, %esi
    969 	lea	-9(%ecx), %edi
    970 	jnz	L(less32bytes)
    971 
    972 	UPDATE_STRNCMP_COUNTER
    973 
    974 	movdqa	(%edx), %xmm3
    975 	pxor	%xmm0, %xmm0
    976 	mov	$16, %ecx
    977 	or	$7, %ebx
    978 	lea	8(%edx), %edi
    979 	and	$0xfff, %edi
    980 	sub	$0x1000, %edi
    981 
    982 	.p2align 4
    983 L(loop_ashr_7):
    984 	add	$16, %edi
    985 	jg	L(nibble_ashr_7)
    986 
    987 L(gobble_ashr_7):
    988 	movdqa	(%eax, %ecx), %xmm1
    989 	movdqa	(%edx, %ecx), %xmm2
    990 	movdqa	%xmm2, %xmm4
    991 
    992 	palignr	$7, %xmm3, %xmm2
    993 
    994 	pcmpeqb	%xmm1, %xmm0
    995 	pcmpeqb	%xmm2, %xmm1
    996 	psubb	%xmm0, %xmm1
    997 	pmovmskb %xmm1, %esi
    998 	sub	$0xffff, %esi
    999 	jnz	L(exit)
   1000 
   1001 #ifdef USE_AS_STRNCMP
   1002 	cmp	$16, %ebp
   1003 	lea	-16(%ebp), %ebp
   1004 	jbe	L(more8byteseq)
   1005 #endif
   1006 
   1007 	add	$16, %ecx
   1008 	movdqa	%xmm4, %xmm3
   1009 
   1010 	add	$16, %edi
   1011 	jg	L(nibble_ashr_7)
   1012 
   1013 	movdqa	(%eax, %ecx), %xmm1
   1014 	movdqa	(%edx, %ecx), %xmm2
   1015 	movdqa	%xmm2, %xmm4
   1016 
   1017 	palignr	$7, %xmm3, %xmm2
   1018 
   1019 	pcmpeqb	%xmm1, %xmm0
   1020 	pcmpeqb	%xmm2, %xmm1
   1021 	psubb	%xmm0, %xmm1
   1022 	pmovmskb %xmm1, %esi
   1023 	sub	$0xffff, %esi
   1024 	jnz	L(exit)
   1025 
   1026 #ifdef USE_AS_STRNCMP
   1027 	cmp	$16, %ebp
   1028 	lea	-16(%ebp), %ebp
   1029 	jbe	L(more8byteseq)
   1030 #endif
   1031 
   1032 	add	$16, %ecx
   1033 	movdqa	%xmm4, %xmm3
   1034 	jmp	L(loop_ashr_7)
   1035 
   1036 	.p2align 4
   1037 L(nibble_ashr_7):
   1038 	pcmpeqb	%xmm3, %xmm0
   1039 	pmovmskb %xmm0, %esi
   1040 	test	$0xff80, %esi
   1041 	jnz	L(ashr_7_exittail)
   1042 
   1043 #ifdef USE_AS_STRNCMP
   1044 	cmp	$9, %ebp
   1045 	jbe	L(ashr_7_exittail)
   1046 #endif
    1047 	pxor	%xmm0, %xmm0
   1049 	sub	$0x1000, %edi
   1050 	jmp	L(gobble_ashr_7)
   1051 
   1052 	.p2align 4
   1053 L(ashr_7_exittail):
   1054 	movdqa	(%eax, %ecx), %xmm1
   1055 	psrldq	$7, %xmm0
   1056 	psrldq	$7, %xmm3
   1057 	jmp	L(aftertail)
   1058 
   1059 /*
   1060  * The following cases will be handled by ashr_8
   1061  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1062  *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
   1063  */
   1064 	.p2align 4
   1065 L(ashr_8):
   1066 	mov	$0xffff, %esi
   1067 	pxor	%xmm0, %xmm0
   1068 	movdqa	(%edx), %xmm2
   1069 	movdqa	(%eax), %xmm1
   1070 	pcmpeqb	%xmm1, %xmm0
   1071 	pslldq	$8, %xmm2
   1072 	pcmpeqb	%xmm1, %xmm2
   1073 	psubb	%xmm0, %xmm2
   1074 	pmovmskb %xmm2, %edi
   1075 	shr	%cl, %esi
   1076 	shr	%cl, %edi
   1077 	sub	%edi, %esi
   1078 	lea	-8(%ecx), %edi
   1079 	jnz	L(less32bytes)
   1080 
   1081 	UPDATE_STRNCMP_COUNTER
   1082 
   1083 	movdqa	(%edx), %xmm3
   1084 	pxor	%xmm0, %xmm0
   1085 	mov	$16, %ecx
   1086 	or	$8, %ebx
   1087 	lea	8(%edx), %edi
   1088 	and	$0xfff, %edi
   1089 	sub	$0x1000, %edi
   1090 
   1091 	.p2align 4
   1092 L(loop_ashr_8):
   1093 	add	$16, %edi
   1094 	jg	L(nibble_ashr_8)
   1095 
   1096 L(gobble_ashr_8):
   1097 	movdqa	(%eax, %ecx), %xmm1
   1098 	movdqa	(%edx, %ecx), %xmm2
   1099 	movdqa	%xmm2, %xmm4
   1100 
   1101 	palignr	$8, %xmm3, %xmm2
   1102 
   1103 	pcmpeqb	%xmm1, %xmm0
   1104 	pcmpeqb	%xmm2, %xmm1
   1105 	psubb	%xmm0, %xmm1
   1106 	pmovmskb %xmm1, %esi
   1107 	sub	$0xffff, %esi
   1108 	jnz	L(exit)
   1109 
   1110 #ifdef USE_AS_STRNCMP
   1111 	cmp	$16, %ebp
   1112 	lea	-16(%ebp), %ebp
   1113 	jbe	L(more8byteseq)
   1114 #endif
   1115 	add	$16, %ecx
   1116 	movdqa	%xmm4, %xmm3
   1117 
   1118 	add	$16, %edi
   1119 	jg	L(nibble_ashr_8)
   1120 
   1121 	movdqa	(%eax, %ecx), %xmm1
   1122 	movdqa	(%edx, %ecx), %xmm2
   1123 	movdqa	%xmm2, %xmm4
   1124 
   1125 	palignr	$8, %xmm3, %xmm2
   1126 
   1127 	pcmpeqb	%xmm1, %xmm0
   1128 	pcmpeqb	%xmm2, %xmm1
   1129 	psubb	%xmm0, %xmm1
   1130 	pmovmskb %xmm1, %esi
   1131 	sub	$0xffff, %esi
   1132 	jnz	L(exit)
   1133 
   1134 #ifdef USE_AS_STRNCMP
   1135 	cmp	$16, %ebp
   1136 	lea	-16(%ebp), %ebp
   1137 	jbe	L(more8byteseq)
   1138 #endif
   1139 	add	$16, %ecx
   1140 	movdqa	%xmm4, %xmm3
   1141 	jmp	L(loop_ashr_8)
   1142 
   1143 	.p2align 4
   1144 L(nibble_ashr_8):
   1145 	pcmpeqb	%xmm3, %xmm0
   1146 	pmovmskb %xmm0, %esi
   1147 	test	$0xff00, %esi
   1148 	jnz	L(ashr_8_exittail)
   1149 
   1150 #ifdef USE_AS_STRNCMP
   1151 	cmp	$8, %ebp
   1152 	jbe	L(ashr_8_exittail)
   1153 #endif
    1154 	pxor	%xmm0, %xmm0
   1156 	sub	$0x1000, %edi
   1157 	jmp	L(gobble_ashr_8)
   1158 
   1159 	.p2align 4
   1160 L(ashr_8_exittail):
   1161 	movdqa	(%eax, %ecx), %xmm1
   1162 	psrldq	$8, %xmm0
   1163 	psrldq	$8, %xmm3
   1164 	jmp	L(aftertail)
   1165 
   1166 /*
   1167  * The following cases will be handled by ashr_9
   1168  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1169  *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
   1170  */
   1171 	.p2align 4
   1172 L(ashr_9):
   1173 	mov	$0xffff, %esi
   1174 	pxor	%xmm0, %xmm0
   1175 	movdqa	(%edx), %xmm2
   1176 	movdqa	(%eax), %xmm1
   1177 	pcmpeqb	%xmm1, %xmm0
   1178 	pslldq	$7, %xmm2
   1179 	pcmpeqb	%xmm1, %xmm2
   1180 	psubb	%xmm0, %xmm2
   1181 	pmovmskb %xmm2, %edi
   1182 	shr	%cl, %esi
   1183 	shr	%cl, %edi
   1184 	sub	%edi, %esi
   1185 	lea	-7(%ecx), %edi
   1186 	jnz	L(less32bytes)
   1187 
   1188 	UPDATE_STRNCMP_COUNTER
   1189 
   1190 	movdqa	(%edx), %xmm3
   1191 	pxor	%xmm0, %xmm0
   1192 	mov	$16, %ecx
   1193 	or	$9, %ebx
   1194 	lea	9(%edx), %edi
   1195 	and	$0xfff, %edi
   1196 	sub	$0x1000, %edi
   1197 
   1198 	.p2align 4
   1199 L(loop_ashr_9):
   1200 	add	$16, %edi
   1201 	jg	L(nibble_ashr_9)
   1202 
   1203 L(gobble_ashr_9):
   1204 	movdqa	(%eax, %ecx), %xmm1
   1205 	movdqa	(%edx, %ecx), %xmm2
   1206 	movdqa	%xmm2, %xmm4
   1207 
   1208 	palignr	$9, %xmm3, %xmm2
   1209 
   1210 	pcmpeqb	%xmm1, %xmm0
   1211 	pcmpeqb	%xmm2, %xmm1
   1212 	psubb	%xmm0, %xmm1
   1213 	pmovmskb %xmm1, %esi
   1214 	sub	$0xffff, %esi
   1215 	jnz	L(exit)
   1216 
   1217 #ifdef USE_AS_STRNCMP
   1218 	cmp	$16, %ebp
   1219 	lea	-16(%ebp), %ebp
   1220 	jbe	L(more8byteseq)
   1221 #endif
   1222 	add	$16, %ecx
   1223 	movdqa	%xmm4, %xmm3
   1224 
   1225 	add	$16, %edi
   1226 	jg	L(nibble_ashr_9)
   1227 
   1228 	movdqa	(%eax, %ecx), %xmm1
   1229 	movdqa	(%edx, %ecx), %xmm2
   1230 	movdqa	%xmm2, %xmm4
   1231 
   1232 	palignr	$9, %xmm3, %xmm2
   1233 
   1234 	pcmpeqb	%xmm1, %xmm0
   1235 	pcmpeqb	%xmm2, %xmm1
   1236 	psubb	%xmm0, %xmm1
   1237 	pmovmskb %xmm1, %esi
   1238 	sub	$0xffff, %esi
   1239 	jnz	L(exit)
   1240 
   1241 #ifdef USE_AS_STRNCMP
   1242 	cmp	$16, %ebp
   1243 	lea	-16(%ebp), %ebp
   1244 	jbe	L(more8byteseq)
   1245 #endif
   1246 	add	$16, %ecx
   1247 	movdqa	%xmm4, %xmm3
   1248 	jmp	L(loop_ashr_9)
   1249 
   1250 	.p2align 4
   1251 L(nibble_ashr_9):
   1252 	pcmpeqb	%xmm3, %xmm0
   1253 	pmovmskb %xmm0, %esi
   1254 	test	$0xfe00, %esi
   1255 	jnz	L(ashr_9_exittail)
   1256 
   1257 #ifdef USE_AS_STRNCMP
   1258 	cmp	$7, %ebp
   1259 	jbe	L(ashr_9_exittail)
   1260 #endif
   1261 	pxor	%xmm0, %xmm0
   1262 	sub	$0x1000, %edi
   1263 	jmp	L(gobble_ashr_9)
   1264 
   1265 	.p2align 4
   1266 L(ashr_9_exittail):
   1267 	movdqa	(%eax, %ecx), %xmm1
   1268 	psrldq	$9, %xmm0
   1269 	psrldq	$9, %xmm3
   1270 	jmp	L(aftertail)
   1271 
   1272 /*
   1273  * The following cases will be handled by ashr_10
   1274  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1275  *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
   1276  */
   1277 	.p2align 4
   1278 L(ashr_10):
   1279 	mov	$0xffff, %esi
   1280 	pxor	%xmm0, %xmm0
   1281 	movdqa	(%edx), %xmm2
   1282 	movdqa	(%eax), %xmm1
   1283 	pcmpeqb	%xmm1, %xmm0
   1284 	pslldq	$6, %xmm2
   1285 	pcmpeqb	%xmm1, %xmm2
   1286 	psubb	%xmm0, %xmm2
   1287 	pmovmskb %xmm2, %edi
   1288 	shr	%cl, %esi
   1289 	shr	%cl, %edi
   1290 	sub	%edi, %esi
   1291 	lea	-6(%ecx), %edi
   1292 	jnz	L(less32bytes)
   1293 
   1294 	UPDATE_STRNCMP_COUNTER
   1295 
   1296 	movdqa	(%edx), %xmm3
   1297 	pxor	%xmm0, %xmm0
   1298 	mov	$16, %ecx
   1299 	or	$10, %ebx
   1300 	lea	10(%edx), %edi
   1301 	and	$0xfff, %edi
   1302 	sub	$0x1000, %edi
   1303 
   1304 	.p2align 4
   1305 L(loop_ashr_10):
   1306 	add	$16, %edi
   1307 	jg	L(nibble_ashr_10)
   1308 
   1309 L(gobble_ashr_10):
   1310 	movdqa	(%eax, %ecx), %xmm1
   1311 	movdqa	(%edx, %ecx), %xmm2
   1312 	movdqa	%xmm2, %xmm4
   1313 
   1314 	palignr	$10, %xmm3, %xmm2
   1315 
   1316 	pcmpeqb	%xmm1, %xmm0
   1317 	pcmpeqb	%xmm2, %xmm1
   1318 	psubb	%xmm0, %xmm1
   1319 	pmovmskb %xmm1, %esi
   1320 	sub	$0xffff, %esi
   1321 	jnz	L(exit)
   1322 
   1323 #ifdef USE_AS_STRNCMP
   1324 	cmp	$16, %ebp
   1325 	lea	-16(%ebp), %ebp
   1326 	jbe	L(more8byteseq)
   1327 #endif
   1328 	add	$16, %ecx
   1329 	movdqa	%xmm4, %xmm3
   1330 
   1331 	add	$16, %edi
   1332 	jg	L(nibble_ashr_10)
   1333 
   1334 	movdqa	(%eax, %ecx), %xmm1
   1335 	movdqa	(%edx, %ecx), %xmm2
   1336 	movdqa	%xmm2, %xmm4
   1337 
   1338 	palignr	$10, %xmm3, %xmm2
   1339 
   1340 	pcmpeqb	%xmm1, %xmm0
   1341 	pcmpeqb	%xmm2, %xmm1
   1342 	psubb	%xmm0, %xmm1
   1343 	pmovmskb %xmm1, %esi
   1344 	sub	$0xffff, %esi
   1345 	jnz	L(exit)
   1346 
   1347 #ifdef USE_AS_STRNCMP
   1348 	cmp	$16, %ebp
   1349 	lea	-16(%ebp), %ebp
   1350 	jbe	L(more8byteseq)
   1351 #endif
   1352 	add	$16, %ecx
   1353 	movdqa	%xmm4, %xmm3
   1354 	jmp	L(loop_ashr_10)
   1355 
   1356 	.p2align 4
   1357 L(nibble_ashr_10):
   1358 	pcmpeqb	%xmm3, %xmm0
   1359 	pmovmskb %xmm0, %esi
   1360 	test	$0xfc00, %esi
   1361 	jnz	L(ashr_10_exittail)
   1362 
   1363 #ifdef USE_AS_STRNCMP
   1364 	cmp	$6, %ebp
   1365 	jbe	L(ashr_10_exittail)
   1366 #endif
   1367 	pxor	%xmm0, %xmm0
   1368 	sub	$0x1000, %edi
   1369 	jmp	L(gobble_ashr_10)
   1370 
   1371 	.p2align 4
   1372 L(ashr_10_exittail):
   1373 	movdqa	(%eax, %ecx), %xmm1
   1374 	psrldq	$10, %xmm0
   1375 	psrldq	$10, %xmm3
   1376 	jmp	L(aftertail)
   1377 
   1378 /*
   1379  * The following cases will be handled by ashr_11
   1380  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1381  *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
   1382  */
   1383 	.p2align 4
   1384 L(ashr_11):
   1385 	mov	$0xffff, %esi
   1386 	pxor	%xmm0, %xmm0
   1387 	movdqa	(%edx), %xmm2
   1388 	movdqa	(%eax), %xmm1
   1389 	pcmpeqb	%xmm1, %xmm0
   1390 	pslldq	$5, %xmm2
   1391 	pcmpeqb	%xmm1, %xmm2
   1392 	psubb	%xmm0, %xmm2
   1393 	pmovmskb %xmm2, %edi
   1394 	shr	%cl, %esi
   1395 	shr	%cl, %edi
   1396 	sub	%edi, %esi
   1397 	lea	-5(%ecx), %edi
   1398 	jnz	L(less32bytes)
   1399 
   1400 	UPDATE_STRNCMP_COUNTER
   1401 
   1402 	movdqa	(%edx), %xmm3
   1403 	pxor	%xmm0, %xmm0
   1404 	mov	$16, %ecx
   1405 	or	$11, %ebx
   1406 	lea	11(%edx), %edi
   1407 	and	$0xfff, %edi
   1408 	sub	$0x1000, %edi
   1409 
   1410 	.p2align 4
   1411 L(loop_ashr_11):
   1412 	add	$16, %edi
   1413 	jg	L(nibble_ashr_11)
   1414 
   1415 L(gobble_ashr_11):
   1416 	movdqa	(%eax, %ecx), %xmm1
   1417 	movdqa	(%edx, %ecx), %xmm2
   1418 	movdqa	%xmm2, %xmm4
   1419 
   1420 	palignr	$11, %xmm3, %xmm2
   1421 
   1422 	pcmpeqb	%xmm1, %xmm0
   1423 	pcmpeqb	%xmm2, %xmm1
   1424 	psubb	%xmm0, %xmm1
   1425 	pmovmskb %xmm1, %esi
   1426 	sub	$0xffff, %esi
   1427 	jnz	L(exit)
   1428 
   1429 #ifdef USE_AS_STRNCMP
   1430 	cmp	$16, %ebp
   1431 	lea	-16(%ebp), %ebp
   1432 	jbe	L(more8byteseq)
   1433 #endif
   1434 	add	$16, %ecx
   1435 	movdqa	%xmm4, %xmm3
   1436 
   1437 	add	$16, %edi
   1438 	jg	L(nibble_ashr_11)
   1439 
   1440 	movdqa	(%eax, %ecx), %xmm1
   1441 	movdqa	(%edx, %ecx), %xmm2
   1442 	movdqa	%xmm2, %xmm4
   1443 
   1444 	palignr	$11, %xmm3, %xmm2
   1445 
   1446 	pcmpeqb	%xmm1, %xmm0
   1447 	pcmpeqb	%xmm2, %xmm1
   1448 	psubb	%xmm0, %xmm1
   1449 	pmovmskb %xmm1, %esi
   1450 	sub	$0xffff, %esi
   1451 	jnz	L(exit)
   1452 
   1453 #ifdef USE_AS_STRNCMP
   1454 	cmp	$16, %ebp
   1455 	lea	-16(%ebp), %ebp
   1456 	jbe	L(more8byteseq)
   1457 #endif
   1458 	add	$16, %ecx
   1459 	movdqa	%xmm4, %xmm3
   1460 	jmp	L(loop_ashr_11)
   1461 
   1462 	.p2align 4
   1463 L(nibble_ashr_11):
   1464 	pcmpeqb	%xmm3, %xmm0
   1465 	pmovmskb %xmm0, %esi
   1466 	test	$0xf800, %esi
   1467 	jnz	L(ashr_11_exittail)
   1468 
   1469 #ifdef USE_AS_STRNCMP
   1470 	cmp	$5, %ebp
   1471 	jbe	L(ashr_11_exittail)
   1472 #endif
   1473 	pxor	%xmm0, %xmm0
   1474 	sub	$0x1000, %edi
   1475 	jmp	L(gobble_ashr_11)
   1476 
   1477 	.p2align 4
   1478 L(ashr_11_exittail):
   1479 	movdqa	(%eax, %ecx), %xmm1
   1480 	psrldq	$11, %xmm0
   1481 	psrldq	$11, %xmm3
   1482 	jmp	L(aftertail)
   1483 
   1484 /*
   1485  * The following cases will be handled by ashr_12
   1486  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1487  *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
   1488  */
   1489 	.p2align 4
   1490 L(ashr_12):
   1491 	mov	$0xffff, %esi
   1492 	pxor	%xmm0, %xmm0
   1493 	movdqa	(%edx), %xmm2
   1494 	movdqa	(%eax), %xmm1
   1495 	pcmpeqb	%xmm1, %xmm0
   1496 	pslldq	$4, %xmm2
   1497 	pcmpeqb	%xmm1, %xmm2
   1498 	psubb	%xmm0, %xmm2
   1499 	pmovmskb %xmm2, %edi
   1500 	shr	%cl, %esi
   1501 	shr	%cl, %edi
   1502 	sub	%edi, %esi
   1503 	lea	-4(%ecx), %edi
   1504 	jnz	L(less32bytes)
   1505 
   1506 	UPDATE_STRNCMP_COUNTER
   1507 
   1508 	movdqa	(%edx), %xmm3
   1509 	pxor	%xmm0, %xmm0
   1510 	mov	$16, %ecx
   1511 	or	$12, %ebx
   1512 	lea	12(%edx), %edi
   1513 	and	$0xfff, %edi
   1514 	sub	$0x1000, %edi
   1515 
   1516 	.p2align 4
   1517 L(loop_ashr_12):
   1518 	add	$16, %edi
   1519 	jg	L(nibble_ashr_12)
   1520 
   1521 L(gobble_ashr_12):
   1522 	movdqa	(%eax, %ecx), %xmm1
   1523 	movdqa	(%edx, %ecx), %xmm2
   1524 	movdqa	%xmm2, %xmm4
   1525 
   1526 	palignr	$12, %xmm3, %xmm2
   1527 
   1528 	pcmpeqb	%xmm1, %xmm0
   1529 	pcmpeqb	%xmm2, %xmm1
   1530 	psubb	%xmm0, %xmm1
   1531 	pmovmskb %xmm1, %esi
   1532 	sub	$0xffff, %esi
   1533 	jnz	L(exit)
   1534 
   1535 #ifdef USE_AS_STRNCMP
   1536 	cmp	$16, %ebp
   1537 	lea	-16(%ebp), %ebp
   1538 	jbe	L(more8byteseq)
   1539 #endif
   1540 
   1541 	add	$16, %ecx
   1542 	movdqa	%xmm4, %xmm3
   1543 
   1544 	add	$16, %edi
   1545 	jg	L(nibble_ashr_12)
   1546 
   1547 	movdqa	(%eax, %ecx), %xmm1
   1548 	movdqa	(%edx, %ecx), %xmm2
   1549 	movdqa	%xmm2, %xmm4
   1550 
   1551 	palignr	$12, %xmm3, %xmm2
   1552 
   1553 	pcmpeqb	%xmm1, %xmm0
   1554 	pcmpeqb	%xmm2, %xmm1
   1555 	psubb	%xmm0, %xmm1
   1556 	pmovmskb %xmm1, %esi
   1557 	sub	$0xffff, %esi
   1558 	jnz	L(exit)
   1559 
   1560 #ifdef USE_AS_STRNCMP
   1561 	cmp	$16, %ebp
   1562 	lea	-16(%ebp), %ebp
   1563 	jbe	L(more8byteseq)
   1564 #endif
   1565 	add	$16, %ecx
   1566 	movdqa	%xmm4, %xmm3
   1567 	jmp	L(loop_ashr_12)
   1568 
   1569 	.p2align 4
   1570 L(nibble_ashr_12):
   1571 	pcmpeqb	%xmm3, %xmm0
   1572 	pmovmskb %xmm0, %esi
   1573 	test	$0xf000, %esi
   1574 	jnz	L(ashr_12_exittail)
   1575 
   1576 #ifdef USE_AS_STRNCMP
   1577 	cmp	$4, %ebp
   1578 	jbe	L(ashr_12_exittail)
   1579 #endif
   1580 	pxor	%xmm0, %xmm0
   1581 	sub	$0x1000, %edi
   1582 	jmp	L(gobble_ashr_12)
   1583 
   1584 	.p2align 4
   1585 L(ashr_12_exittail):
   1586 	movdqa	(%eax, %ecx), %xmm1
   1587 	psrldq	$12, %xmm0
   1588 	psrldq	$12, %xmm3
   1589 	jmp	L(aftertail)
   1590 
   1591 /*
   1592  * The following cases will be handled by ashr_13
   1593  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1594  *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
   1595  */
   1596 	.p2align 4
   1597 L(ashr_13):
   1598 	mov	$0xffff, %esi
   1599 	pxor	%xmm0, %xmm0
   1600 	movdqa	(%edx), %xmm2
   1601 	movdqa	(%eax), %xmm1
   1602 	pcmpeqb	%xmm1, %xmm0
   1603 	pslldq	$3, %xmm2
   1604 	pcmpeqb	%xmm1, %xmm2
   1605 	psubb	%xmm0, %xmm2
   1606 	pmovmskb %xmm2, %edi
   1607 	shr	%cl, %esi
   1608 	shr	%cl, %edi
   1609 	sub	%edi, %esi
   1610 	lea	-3(%ecx), %edi
   1611 	jnz	L(less32bytes)
   1612 
   1613 	UPDATE_STRNCMP_COUNTER
   1614 
   1615 	movdqa	(%edx), %xmm3
   1616 	pxor	%xmm0, %xmm0
   1617 	mov	$16, %ecx
   1618 	or	$13, %ebx
   1619 	lea	13(%edx), %edi
   1620 	and	$0xfff, %edi
   1621 	sub	$0x1000, %edi
   1622 
   1623 	.p2align 4
   1624 L(loop_ashr_13):
   1625 	add	$16, %edi
   1626 	jg	L(nibble_ashr_13)
   1627 
   1628 L(gobble_ashr_13):
   1629 	movdqa	(%eax, %ecx), %xmm1
   1630 	movdqa	(%edx, %ecx), %xmm2
   1631 	movdqa	%xmm2, %xmm4
   1632 
   1633 	palignr	$13, %xmm3, %xmm2
   1634 
   1635 	pcmpeqb	%xmm1, %xmm0
   1636 	pcmpeqb	%xmm2, %xmm1
   1637 	psubb	%xmm0, %xmm1
   1638 	pmovmskb %xmm1, %esi
   1639 	sub	$0xffff, %esi
   1640 	jnz	L(exit)
   1641 
   1642 #ifdef USE_AS_STRNCMP
   1643 	cmp	$16, %ebp
   1644 	lea	-16(%ebp), %ebp
   1645 	jbe	L(more8byteseq)
   1646 #endif
   1647 	add	$16, %ecx
   1648 	movdqa	%xmm4, %xmm3
   1649 
   1650 	add	$16, %edi
   1651 	jg	L(nibble_ashr_13)
   1652 
   1653 	movdqa	(%eax, %ecx), %xmm1
   1654 	movdqa	(%edx, %ecx), %xmm2
   1655 	movdqa	%xmm2, %xmm4
   1656 
   1657 	palignr	$13, %xmm3, %xmm2
   1658 
   1659 	pcmpeqb	%xmm1, %xmm0
   1660 	pcmpeqb	%xmm2, %xmm1
   1661 	psubb	%xmm0, %xmm1
   1662 	pmovmskb %xmm1, %esi
   1663 	sub	$0xffff, %esi
   1664 	jnz	L(exit)
   1665 
   1666 #ifdef USE_AS_STRNCMP
   1667 	cmp	$16, %ebp
   1668 	lea	-16(%ebp), %ebp
   1669 	jbe	L(more8byteseq)
   1670 #endif
   1671 	add	$16, %ecx
   1672 	movdqa	%xmm4, %xmm3
   1673 	jmp	L(loop_ashr_13)
   1674 
   1675 	.p2align 4
   1676 L(nibble_ashr_13):
   1677 	pcmpeqb	%xmm3, %xmm0
   1678 	pmovmskb %xmm0, %esi
   1679 	test	$0xe000, %esi
   1680 	jnz	L(ashr_13_exittail)
   1681 
   1682 #ifdef USE_AS_STRNCMP
   1683 	cmp	$3, %ebp
   1684 	jbe	L(ashr_13_exittail)
   1685 #endif
   1686 	pxor	%xmm0, %xmm0
   1687 	sub	$0x1000, %edi
   1688 	jmp	L(gobble_ashr_13)
   1689 
   1690 	.p2align 4
   1691 L(ashr_13_exittail):
   1692 	movdqa	(%eax, %ecx), %xmm1
   1693 	psrldq	$13, %xmm0
   1694 	psrldq	$13, %xmm3
   1695 	jmp	L(aftertail)
   1696 
   1697 /*
   1698  * The following cases will be handled by ashr_14
   1699  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1700  *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
   1701  */
   1702 	.p2align 4
   1703 L(ashr_14):
   1704 	mov	$0xffff, %esi
   1705 	pxor	%xmm0, %xmm0
   1706 	movdqa	(%edx), %xmm2
   1707 	movdqa	(%eax), %xmm1
   1708 	pcmpeqb	%xmm1, %xmm0
   1709 	pslldq	$2, %xmm2
   1710 	pcmpeqb	%xmm1, %xmm2
   1711 	psubb	%xmm0, %xmm2
   1712 	pmovmskb %xmm2, %edi
   1713 	shr	%cl, %esi
   1714 	shr	%cl, %edi
   1715 	sub	%edi, %esi
   1716 	lea	-2(%ecx), %edi
   1717 	jnz	L(less32bytes)
   1718 
   1719 	UPDATE_STRNCMP_COUNTER
   1720 
   1721 	movdqa	(%edx), %xmm3
   1722 	pxor	%xmm0, %xmm0
   1723 	mov	$16, %ecx
   1724 	or	$14, %ebx
   1725 	lea	14(%edx), %edi
   1726 	and	$0xfff, %edi
   1727 	sub	$0x1000, %edi
   1728 
   1729 	.p2align 4
   1730 L(loop_ashr_14):
   1731 	add	$16, %edi
   1732 	jg	L(nibble_ashr_14)
   1733 
   1734 L(gobble_ashr_14):
   1735 	movdqa	(%eax, %ecx), %xmm1
   1736 	movdqa	(%edx, %ecx), %xmm2
   1737 	movdqa	%xmm2, %xmm4
   1738 
   1739 	palignr	$14, %xmm3, %xmm2
   1740 
   1741 	pcmpeqb	%xmm1, %xmm0
   1742 	pcmpeqb	%xmm2, %xmm1
   1743 	psubb	%xmm0, %xmm1
   1744 	pmovmskb %xmm1, %esi
   1745 	sub	$0xffff, %esi
   1746 	jnz	L(exit)
   1747 
   1748 #ifdef USE_AS_STRNCMP
   1749 	cmp	$16, %ebp
   1750 	lea	-16(%ebp), %ebp
   1751 	jbe	L(more8byteseq)
   1752 #endif
   1753 	add	$16, %ecx
   1754 	movdqa	%xmm4, %xmm3
   1755 
   1756 	add	$16, %edi
   1757 	jg	L(nibble_ashr_14)
   1758 
   1759 	movdqa	(%eax, %ecx), %xmm1
   1760 	movdqa	(%edx, %ecx), %xmm2
   1761 	movdqa	%xmm2, %xmm4
   1762 
   1763 	palignr	$14, %xmm3, %xmm2
   1764 
   1765 	pcmpeqb	%xmm1, %xmm0
   1766 	pcmpeqb	%xmm2, %xmm1
   1767 	psubb	%xmm0, %xmm1
   1768 	pmovmskb %xmm1, %esi
   1769 	sub	$0xffff, %esi
   1770 	jnz	L(exit)
   1771 
   1772 #ifdef USE_AS_STRNCMP
   1773 	cmp	$16, %ebp
   1774 	lea	-16(%ebp), %ebp
   1775 	jbe	L(more8byteseq)
   1776 #endif
   1777 	add	$16, %ecx
   1778 	movdqa	%xmm4, %xmm3
   1779 	jmp	L(loop_ashr_14)
   1780 
   1781 	.p2align 4
   1782 L(nibble_ashr_14):
   1783 	pcmpeqb	%xmm3, %xmm0
   1784 	pmovmskb %xmm0, %esi
   1785 	test	$0xc000, %esi
   1786 	jnz	L(ashr_14_exittail)
   1787 
   1788 #ifdef USE_AS_STRNCMP
   1789 	cmp	$2, %ebp
   1790 	jbe	L(ashr_14_exittail)
   1791 #endif
   1792 	pxor	%xmm0, %xmm0
   1793 	sub	$0x1000, %edi
   1794 	jmp	L(gobble_ashr_14)
   1795 
   1796 	.p2align 4
   1797 L(ashr_14_exittail):
   1798 	movdqa	(%eax, %ecx), %xmm1
   1799 	psrldq	$14, %xmm0
   1800 	psrldq	$14, %xmm3
   1801 	jmp	L(aftertail)
   1802 
   1803 /*
    1804  * The following cases will be handled by ashr_15
   1805  * ecx(offset of esi)  eax(offset of edi)   relative offset   	corresponding case
   1806  *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
   1807  */
   1808 
   1809 	.p2align 4
   1810 L(ashr_15):
   1811 	mov	$0xffff, %esi
   1812 	pxor	%xmm0, %xmm0
   1813 	movdqa	(%edx), %xmm2
   1814 	movdqa	(%eax), %xmm1
   1815 	pcmpeqb	%xmm1, %xmm0
   1816 	pslldq	$1, %xmm2
   1817 	pcmpeqb	%xmm1, %xmm2
   1818 	psubb	%xmm0, %xmm2
   1819 	pmovmskb %xmm2, %edi
   1820 	shr	%cl, %esi
   1821 	shr	%cl, %edi
   1822 	sub	%edi, %esi
   1823 	lea	-1(%ecx), %edi
   1824 	jnz	L(less32bytes)
   1825 
   1826 	UPDATE_STRNCMP_COUNTER
   1827 
   1828 	movdqa	(%edx), %xmm3
   1829 	pxor	%xmm0, %xmm0
   1830 	mov	$16, %ecx
   1831 	or	$15, %ebx
   1832 	lea	15(%edx), %edi
   1833 	and	$0xfff, %edi
   1834 	sub	$0x1000, %edi
   1835 
   1836 	.p2align 4
   1837 L(loop_ashr_15):
   1838 	add	$16, %edi
   1839 	jg	L(nibble_ashr_15)
   1840 
   1841 L(gobble_ashr_15):
   1842 	movdqa	(%eax, %ecx), %xmm1
   1843 	movdqa	(%edx, %ecx), %xmm2
   1844 	movdqa	%xmm2, %xmm4
   1845 
   1846 	palignr	$15, %xmm3, %xmm2
   1847 
   1848 	pcmpeqb	%xmm1, %xmm0
   1849 	pcmpeqb	%xmm2, %xmm1
   1850 	psubb	%xmm0, %xmm1
   1851 	pmovmskb %xmm1, %esi
   1852 	sub	$0xffff, %esi
   1853 	jnz	L(exit)
   1854 
   1855 #ifdef USE_AS_STRNCMP
   1856 	cmp	$16, %ebp
   1857 	lea	-16(%ebp), %ebp
   1858 	jbe	L(more8byteseq)
   1859 #endif
   1860 	add	$16, %ecx
   1861 	movdqa	%xmm4, %xmm3
   1862 
   1863 	add	$16, %edi
   1864 	jg	L(nibble_ashr_15)
   1865 
   1866 	movdqa	(%eax, %ecx), %xmm1
   1867 	movdqa	(%edx, %ecx), %xmm2
   1868 	movdqa	%xmm2, %xmm4
   1869 
   1870 	palignr	$15, %xmm3, %xmm2
   1871 
   1872 	pcmpeqb	%xmm1, %xmm0
   1873 	pcmpeqb	%xmm2, %xmm1
   1874 	psubb	%xmm0, %xmm1
   1875 	pmovmskb %xmm1, %esi
   1876 	sub	$0xffff, %esi
   1877 	jnz	L(exit)
   1878 
   1879 #ifdef USE_AS_STRNCMP
   1880 	cmp	$16, %ebp
   1881 	lea	-16(%ebp), %ebp
   1882 	jbe	L(more8byteseq)
   1883 #endif
   1884 	add	$16, %ecx
   1885 	movdqa	%xmm4, %xmm3
   1886 	jmp	L(loop_ashr_15)
   1887 
   1888 	.p2align 4
   1889 L(nibble_ashr_15):
   1890 	pcmpeqb	%xmm3, %xmm0
   1891 	pmovmskb %xmm0, %esi
   1892 	test	$0x8000, %esi
   1893 	jnz	L(ashr_15_exittail)
   1894 
   1895 #ifdef USE_AS_STRNCMP
   1896 	cmp	$1, %ebp
   1897 	jbe	L(ashr_15_exittail)
   1898 #endif
   1899 	pxor	%xmm0, %xmm0
   1900 	sub	$0x1000, %edi
   1901 	jmp	L(gobble_ashr_15)
   1902 
   1903 	.p2align 4
   1904 L(ashr_15_exittail):
   1905 	movdqa	(%eax, %ecx), %xmm1
   1906 	psrldq	$15, %xmm0
   1907 	psrldq	$15, %xmm3
   1908 	jmp	L(aftertail)
   1909 
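/*
 * Common tail.  L(aftertail) finishes the last partially compared block:
 * xmm1 holds the %eax data and xmm3 the shifted %edx data, so after the
 * compare the inverted mask in %esi has its lowest set bit at the first
 * difference or terminator.  L(exit)/L(less32bytes) turn the loop offset
 * back into byte addresses (the low five bits of %ebx encode which ashr
 * case was running), the operand swap flagged in bit 5 of %ebx is undone,
 * L(ret2) restores the callee-saved registers, and L(less16bytes) walks
 * the mask to the differing byte.  For strncmp, L(more8byteseq) is the
 * path taken when the byte count runs out; it funnels into L(eq) and
 * returns 0.
 */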
   1910 	.p2align 4
   1911 L(aftertail):
   1912 	pcmpeqb	%xmm3, %xmm1
   1913 	psubb	%xmm0, %xmm1
   1914 	pmovmskb %xmm1, %esi
   1915 	not	%esi
   1916 L(exit):
   1917 	mov	%ebx, %edi
   1918 	and	$0x1f, %edi
   1919 	lea	-16(%edi, %ecx), %edi
   1920 L(less32bytes):
   1921 	add	%edi, %edx
   1922 	add	%ecx, %eax
   1923 	test	$0x20, %ebx
   1924 	jz	L(ret2)
   1925 	xchg	%eax, %edx
   1926 
   1927 	.p2align 4
   1928 L(ret2):
   1929 	mov	%esi, %ecx
   1930 	POP	(%esi)
   1931 	POP	(%edi)
   1932 	POP	(%ebx)
   1933 L(less16bytes):
   1934 	test	%cl, %cl
   1935 	jz	L(2next_8_bytes)
   1936 
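	/*
	 * The lowest set bit of %ecx marks the first byte that differs or
	 * terminates the string.  %cl covers bytes 0..7; when it is clear
	 * the position lies in bytes 8..15 and L(2next_8_bytes) advances
	 * both pointers by 8 and repeats the scan on %ch.
	 */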
   1937 	test	$0x01, %cl
   1938 	jnz	L(Byte0)
   1939 
   1940 	test	$0x02, %cl
   1941 	jnz	L(Byte1)
   1942 
   1943 	test	$0x04, %cl
   1944 	jnz	L(Byte2)
   1945 
   1946 	test	$0x08, %cl
   1947 	jnz	L(Byte3)
   1948 
   1949 	test	$0x10, %cl
   1950 	jnz	L(Byte4)
   1951 
   1952 	test	$0x20, %cl
   1953 	jnz	L(Byte5)
   1954 
   1955 	test	$0x40, %cl
   1956 	jnz	L(Byte6)
   1957 #ifdef USE_AS_STRNCMP
   1958 	cmp	$7, %ebp
   1959 	jbe	L(eq)
   1960 #endif
   1961 
   1962 	movzx	7(%eax), %ecx
   1963 	movzx	7(%edx), %eax
   1964 
   1965 	sub	%ecx, %eax
   1966 	RETURN
   1967 
   1968 	.p2align 4
   1969 L(Byte0):
   1970 #ifdef USE_AS_STRNCMP
   1971 	cmp	$0, %ebp
   1972 	jbe	L(eq)
   1973 #endif
   1974 	movzx	(%eax), %ecx
   1975 	movzx	(%edx), %eax
   1976 
   1977 	sub	%ecx, %eax
   1978 	RETURN
   1979 
   1980 	.p2align 4
   1981 L(Byte1):
   1982 #ifdef USE_AS_STRNCMP
   1983 	cmp	$1, %ebp
   1984 	jbe	L(eq)
   1985 #endif
   1986 	movzx	1(%eax), %ecx
   1987 	movzx	1(%edx), %eax
   1988 
   1989 	sub	%ecx, %eax
   1990 	RETURN
   1991 
   1992 	.p2align 4
   1993 L(Byte2):
   1994 #ifdef USE_AS_STRNCMP
   1995 	cmp	$2, %ebp
   1996 	jbe	L(eq)
   1997 #endif
   1998 	movzx	2(%eax), %ecx
   1999 	movzx	2(%edx), %eax
   2000 
   2001 	sub	%ecx, %eax
   2002 	RETURN
   2003 
   2004 	.p2align 4
   2005 L(Byte3):
   2006 #ifdef USE_AS_STRNCMP
   2007 	cmp	$3, %ebp
   2008 	jbe	L(eq)
   2009 #endif
   2010 	movzx	3(%eax), %ecx
   2011 	movzx	3(%edx), %eax
   2012 
   2013 	sub	%ecx, %eax
   2014 	RETURN
   2015 
   2016 	.p2align 4
   2017 L(Byte4):
   2018 #ifdef USE_AS_STRNCMP
   2019 	cmp	$4, %ebp
   2020 	jbe	L(eq)
   2021 #endif
   2022 	movzx	4(%eax), %ecx
   2023 	movzx	4(%edx), %eax
   2024 
   2025 	sub	%ecx, %eax
   2026 	RETURN
   2027 
   2028 	.p2align 4
   2029 L(Byte5):
   2030 #ifdef USE_AS_STRNCMP
   2031 	cmp	$5, %ebp
   2032 	jbe	L(eq)
   2033 #endif
   2034 	movzx	5(%eax), %ecx
   2035 	movzx	5(%edx), %eax
   2036 
   2037 	sub	%ecx, %eax
   2038 	RETURN
   2039 
   2040 	.p2align 4
   2041 L(Byte6):
   2042 #ifdef USE_AS_STRNCMP
   2043 	cmp	$6, %ebp
   2044 	jbe	L(eq)
   2045 #endif
   2046 	movzx	6(%eax), %ecx
   2047 	movzx	6(%edx), %eax
   2048 
   2049 	sub	%ecx, %eax
   2050 	RETURN
   2051 
   2052 	.p2align 4
   2053 L(2next_8_bytes):
   2054 	add	$8, %eax
   2055 	add	$8, %edx
   2056 #ifdef USE_AS_STRNCMP
   2057 	cmp	$8, %ebp
   2058 	lea	-8(%ebp), %ebp
   2059 	jbe	L(eq)
   2060 #endif
   2061 
   2062 	test	$0x01, %ch
   2063 	jnz	L(Byte0)
   2064 
   2065 	test	$0x02, %ch
   2066 	jnz	L(Byte1)
   2067 
   2068 	test	$0x04, %ch
   2069 	jnz	L(Byte2)
   2070 
   2071 	test	$0x08, %ch
   2072 	jnz	L(Byte3)
   2073 
   2074 	test	$0x10, %ch
   2075 	jnz	L(Byte4)
   2076 
   2077 	test	$0x20, %ch
   2078 	jnz	L(Byte5)
   2079 
   2080 	test	$0x40, %ch
   2081 	jnz	L(Byte6)
   2082 
   2083 #ifdef USE_AS_STRNCMP
   2084 	cmp	$7, %ebp
   2085 	jbe	L(eq)
   2086 #endif
   2087 	movzx	7(%eax), %ecx
   2088 	movzx	7(%edx), %eax
   2089 
   2090 	sub	%ecx, %eax
   2091 	RETURN
   2092 
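/*
 * L(neq) is reached from the byte-wise compares with the flags of the
 * failed cmpb still live: "ja" keeps +1 when the first string's byte is
 * the larger one, otherwise the result is negated to -1.
 */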
   2093 	.p2align 4
   2094 L(neq):
   2095 	mov	$1, %eax
   2096 	ja	L(neq_bigger)
   2097 	neg	%eax
   2098 L(neq_bigger):
   2099 	RETURN
   2100 
   2101 #ifdef USE_AS_STRNCMP
   2102 	cfi_restore_state
   2103 	.p2align 4
   2104 L(more8byteseq):
   2105 	POP	(%esi)
   2106 	POP	(%edi)
   2107 	POP	(%ebx)
   2108 #endif
   2109 
   2110 L(eq):
   2111 
   2112 #ifdef USE_AS_STRNCMP
   2113 	POP	(%ebp)
   2114 #endif
   2115 	xorl	%eax, %eax
   2116 	ret
   2117 
   2118 #ifdef USE_AS_STRNCMP
   2119 	CFI_PUSH (%ebp)
   2120 
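/*
 * strncmp only, n < 16: compare at most %ebp bytes one at a time and
 * stop early at the first mismatch or terminating NUL.
 */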
   2121 	.p2align 4
   2122 L(less16bytes_sncmp):
   2123 	test	%ebp, %ebp
   2124 	jz	L(eq)
   2125 
   2126 	movzbl	(%eax), %ecx
   2127 	cmpb	%cl, (%edx)
   2128 	jne	L(neq)
   2129 	test	%cl, %cl
   2130 	je	L(eq)
   2131 
   2132 	cmp	$1, %ebp
   2133 	je	L(eq)
   2134 
   2135 	movzbl	1(%eax), %ecx
   2136 	cmpb	%cl, 1(%edx)
   2137 	jne	L(neq)
   2138 	test	%cl, %cl
   2139 	je	L(eq)
   2140 
   2141 	cmp	$2, %ebp
   2142 	je	L(eq)
   2143 
   2144 	movzbl	2(%eax), %ecx
   2145 	cmpb	%cl, 2(%edx)
   2146 	jne	L(neq)
   2147 	test	%cl, %cl
   2148 	je	L(eq)
   2149 
   2150 	cmp	$3, %ebp
   2151 	je	L(eq)
   2152 
   2153 	movzbl	3(%eax), %ecx
   2154 	cmpb	%cl, 3(%edx)
   2155 	jne	L(neq)
   2156 	test	%cl, %cl
   2157 	je	L(eq)
   2158 
   2159 	cmp	$4, %ebp
   2160 	je	L(eq)
   2161 
   2162 	movzbl	4(%eax), %ecx
   2163 	cmpb	%cl, 4(%edx)
   2164 	jne	L(neq)
   2165 	test	%cl, %cl
   2166 	je	L(eq)
   2167 
   2168 	cmp	$5, %ebp
   2169 	je	L(eq)
   2170 
   2171 	movzbl	5(%eax), %ecx
   2172 	cmpb	%cl, 5(%edx)
   2173 	jne	L(neq)
   2174 	test	%cl, %cl
   2175 	je	L(eq)
   2176 
   2177 	cmp	$6, %ebp
   2178 	je	L(eq)
   2179 
   2180 	movzbl	6(%eax), %ecx
   2181 	cmpb	%cl, 6(%edx)
   2182 	jne	L(neq)
   2183 	test	%cl, %cl
   2184 	je	L(eq)
   2185 
   2186 	cmp	$7, %ebp
   2187 	je	L(eq)
   2188 
   2189 	movzbl	7(%eax), %ecx
   2190 	cmpb	%cl, 7(%edx)
   2191 	jne	L(neq)
   2192 	test	%cl, %cl
   2193 	je	L(eq)
   2194 
   2195 
   2196 	cmp	$8, %ebp
   2197 	je	L(eq)
   2198 
   2199 	movzbl	8(%eax), %ecx
   2200 	cmpb	%cl, 8(%edx)
   2201 	jne	L(neq)
   2202 	test	%cl, %cl
   2203 	je	L(eq)
   2204 
   2205 	cmp	$9, %ebp
   2206 	je	L(eq)
   2207 
   2208 	movzbl	9(%eax), %ecx
   2209 	cmpb	%cl, 9(%edx)
   2210 	jne	L(neq)
   2211 	test	%cl, %cl
   2212 	je	L(eq)
   2213 
   2214 	cmp	$10, %ebp
   2215 	je	L(eq)
   2216 
   2217 	movzbl	10(%eax), %ecx
   2218 	cmpb	%cl, 10(%edx)
   2219 	jne	L(neq)
   2220 	test	%cl, %cl
   2221 	je	L(eq)
   2222 
   2223 	cmp	$11, %ebp
   2224 	je	L(eq)
   2225 
   2226 	movzbl	11(%eax), %ecx
   2227 	cmpb	%cl, 11(%edx)
   2228 	jne	L(neq)
   2229 	test	%cl, %cl
   2230 	je	L(eq)
   2231 
   2232 
   2233 	cmp	$12, %ebp
   2234 	je	L(eq)
   2235 
   2236 	movzbl	12(%eax), %ecx
   2237 	cmpb	%cl, 12(%edx)
   2238 	jne	L(neq)
   2239 	test	%cl, %cl
   2240 	je	L(eq)
   2241 
   2242 	cmp	$13, %ebp
   2243 	je	L(eq)
   2244 
   2245 	movzbl	13(%eax), %ecx
   2246 	cmpb	%cl, 13(%edx)
   2247 	jne	L(neq)
   2248 	test	%cl, %cl
   2249 	je	L(eq)
   2250 
   2251 	cmp	$14, %ebp
   2252 	je	L(eq)
   2253 
   2254 	movzbl	14(%eax), %ecx
   2255 	cmpb	%cl, 14(%edx)
   2256 	jne	L(neq)
   2257 	test	%cl, %cl
   2258 	je	L(eq)
   2259 
   2260 	cmp	$15, %ebp
   2261 	je	L(eq)
   2262 
   2263 	movzbl	15(%eax), %ecx
   2264 	cmpb	%cl, 15(%edx)
   2265 	jne	L(neq)
   2266 	test	%cl, %cl
   2267 	je	L(eq)
   2268 
   2269 	POP	(%ebp)
   2270 	xor	%eax, %eax
   2271 	ret
   2272 #endif
   2273 
   2274 END (ssse3_strcmp_latest)
   2275