Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2010, 2011, 2012, 2013 Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #ifndef L
        /* L(label) builds a local label name: the ## operator pastes ".L" onto
           the argument, so L(zero) becomes .Lzero.  GAS treats .L-prefixed
           symbols as local (non-exported) labels.  Defined only when the
           including environment has not already supplied its own L.  */
     32 # define L(label)	.L##label
     33 #endif
     34 
     35 #ifndef cfi_startproc
        /* Fallback definitions for the cfi_* helper names used in this file.
           Each helper is defined only if it is not already defined, and simply
           forwards to the assembler's .cfi_* directive of the same name,
           passing any arguments through unchanged.  */
     36 # define cfi_startproc	.cfi_startproc
     37 #endif
     38 
     39 #ifndef cfi_endproc
     40 # define cfi_endproc	.cfi_endproc
     41 #endif
     42 
     43 #ifndef cfi_rel_offset
        /* Two-argument form: register and offset.  */
     44 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     45 #endif
     46 
     47 #ifndef cfi_restore
     48 # define cfi_restore(reg)	.cfi_restore reg
     49 #endif
     50 
     51 #ifndef cfi_adjust_cfa_offset
        /* off is a signed byte delta; this file passes 4 and -4.  */
     52 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     53 #endif
     54 
     55 #ifndef cfi_remember_state
        /* remember_state / restore_state are used as a pair by RETURN and
           before each alternative code path in the function body.  */
     56 # define cfi_remember_state	.cfi_remember_state
     57 #endif
     58 
     59 #ifndef cfi_restore_state
     60 # define cfi_restore_state	.cfi_restore_state
     61 #endif
     62 
     63 #ifndef ENTRY
        /* ENTRY(name) opens a function: it declares name as a symbol of type
           @function, makes it global, aligns the location counter to
           2^4 = 16 bytes, emits the label itself and starts a CFI procedure.
           Must be paired with END(name).  Defined only if the build has not
           provided its own ENTRY.  */
     64 # define ENTRY(name)             \
     65 	.type name, @function;   \
     66 	.globl name;             \
     67 	.p2align 4;              \
     68 name:                            \
     69 	cfi_startproc
     70 #endif
     71 
     72 #ifndef END
        /* END(name) closes the function opened by ENTRY(name): it ends the CFI
           procedure and sets the symbol size to the distance from the name
           label to the current location (.-name).  */
     73 # define END(name)               \
     74 	cfi_endproc;             \
     75 	.size name, .-name
     76 #endif
     77 
     78 #ifndef MEMCMP
        /* Symbol name given to the routine by ENTRY (MEMCMP).  A build may
           pre-define MEMCMP to assemble this source under a different name;
           the default is memcmp.  */
     79 # define MEMCMP	memcmp
     80 #endif
     81 
        /* CFI bookkeeping for one 4-byte push of REG: grow the CFA offset by 4
           and describe REG with cfi_rel_offset at offset 0.  Emits no
           instruction, so it can also be used on its own to re-describe the
           frame after a jump.  */
     82 #define CFI_PUSH(REG)	\
     83 	cfi_adjust_cfa_offset (4);	\
     84 	cfi_rel_offset (REG, 0)
     85 
        /* Inverse of CFI_PUSH: shrink the CFA offset by 4 and mark REG as
           restored.  Emits no instruction.  */
     86 #define CFI_POP(REG)	\
     87 	cfi_adjust_cfa_offset (-4);	\
     88 	cfi_restore (REG)
     89 
        /* PUSH/POP emit the real pushl/popl followed by the matching CFI
           annotation.  Invoked as PUSH (%ebx): the parentheses are the macro
           call syntax, so REG is the bare register name.  */
     90 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     91 #define POP(REG)	popl REG; CFI_POP (REG)
     92 
        /* Offsets of the three arguments from %esp, as used at function entry
           before any PUSH: BLK1 = 4, BLK2 = 8, LEN = 12.  The function loads
           BLK1 into %eax, BLK2 into %edx and LEN into %ecx.
           NOTE(review): the 4 bytes skipped by PARMS are presumably the return
           address (i386 stack-argument convention) -- confirm.  */
     93 #define PARMS		4
     94 #define BLK1		PARMS
     95 #define BLK2		BLK1+4
     96 #define LEN		BLK2+4
        /* RETURN_END pops %edi, %esi and %ebx (in that order, with CFI updates)
           and returns.  RETURN does the same and then restores and re-saves
           the remembered CFI state, so that code placed after the ret is
           described by the state saved with cfi_remember_state.  */
     97 #define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
     98 #define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
     99 
    100 /* Warning!
    101            wmemcmp has to use SIGNED comparison for elements.
    102            memcmp has to use UNSIGNED comparison for elements.
    103 */
    104 
    105 	.text
    106 ENTRY (MEMCMP)
    107 	movl	LEN(%esp), %ecx
    108 
    109 #ifdef USE_WCHAR
    110 	shl	$2, %ecx
    111 	jz	L(zero)
    112 #elif defined USE_UTF16
    113 	shl	$1, %ecx
    114 	jz	L(zero)
    115 #endif
    116 
    117 	movl	BLK1(%esp), %eax
    118 	cmp	$48, %ecx
    119 	movl	BLK2(%esp), %edx
    120 	jae	L(48bytesormore)
    121 
    122 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    123 	cmp	$1, %ecx
    124 	jbe	L(less1bytes)
    125 #endif
    126 
    127 	PUSH	(%ebx)
    128 	add	%ecx, %edx
    129 	add	%ecx, %eax
    130 	jmp	L(less48bytes)
    131 
    132 	CFI_POP	(%ebx)
    133 
    134 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    135 	.p2align 4
    136 L(less1bytes):
    137 	jb	L(zero)
    138 	movb	(%eax), %cl
    139 	cmp	(%edx), %cl
    140 	je	L(zero)
    141 	mov	$1, %eax
    142 	ja	L(1bytesend)
    143 	neg	%eax
    144 L(1bytesend):
    145 	ret
    146 #endif
    147 
    148 	.p2align 4
    149 L(zero):
    150 	xor	%eax, %eax
    151 	ret
    152 
    153 	.p2align 4
    154 L(48bytesormore):
    155 	PUSH	(%ebx)
    156 	PUSH	(%esi)
    157 	PUSH	(%edi)
    158 	cfi_remember_state
    159 	movdqu	(%eax), %xmm3
    160 	movdqu	(%edx), %xmm0
    161 	movl	%eax, %edi
    162 	movl	%edx, %esi
    163 	pcmpeqb	%xmm0, %xmm3
    164 	pmovmskb %xmm3, %edx
    165 	lea	16(%edi), %edi
    166 
    167 	sub	$0xffff, %edx
    168 	lea	16(%esi), %esi
    169 	jnz	L(less16bytes)
    170 	mov	%edi, %edx
    171 	and	$0xf, %edx
    172 	xor	%edx, %edi
    173 	sub	%edx, %esi
    174 	add	%edx, %ecx
    175 	mov	%esi, %edx
    176 	and	$0xf, %edx
    177 	jz	L(shr_0)
    178 	xor	%edx, %esi
    179 
    180 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    181 	cmp	$8, %edx
    182 	jae	L(next_unaligned_table)
    183 	cmp	$0, %edx
    184 	je	L(shr_0)
    185 	cmp	$1, %edx
    186 	je	L(shr_1)
    187 	cmp	$2, %edx
    188 	je	L(shr_2)
    189 	cmp	$3, %edx
    190 	je	L(shr_3)
    191 	cmp	$4, %edx
    192 	je	L(shr_4)
    193 	cmp	$5, %edx
    194 	je	L(shr_5)
    195 	cmp	$6, %edx
    196 	je	L(shr_6)
    197 	jmp	L(shr_7)
    198 
    199 	.p2align 2
    200 L(next_unaligned_table):
    201 	cmp	$8, %edx
    202 	je	L(shr_8)
    203 	cmp	$9, %edx
    204 	je	L(shr_9)
    205 	cmp	$10, %edx
    206 	je	L(shr_10)
    207 	cmp	$11, %edx
    208 	je	L(shr_11)
    209 	cmp	$12, %edx
    210 	je	L(shr_12)
    211 	cmp	$13, %edx
    212 	je	L(shr_13)
    213 	cmp	$14, %edx
    214 	je	L(shr_14)
    215 	jmp	L(shr_15)
    216 #elif defined(USE_WCHAR)
    217 	cmp	$0, %edx
    218 	je	L(shr_0)
    219 	cmp	$4, %edx
    220 	je	L(shr_4)
    221 	cmp	$8, %edx
    222 	je	L(shr_8)
    223 	jmp	L(shr_12)
    224 #elif defined(USE_UTF16)
    225 	cmp	$0, %edx
    226 	je	L(shr_0)
    227 	cmp	$2, %edx
    228 	je	L(shr_2)
    229 	cmp	$4, %edx
    230 	je	L(shr_4)
    231 	cmp	$6, %edx
    232 	je	L(shr_6)
    233 	cmp	$8, %edx
    234 	je	L(shr_8)
    235 	cmp	$10, %edx
    236 	je	L(shr_10)
    237 	cmp	$12, %edx
    238 	je	L(shr_12)
    239 	jmp	L(shr_14)
    240 #endif
    241 
    242 	.p2align 4
    243 L(shr_0):
    244 	cmp	$80, %ecx
    245 	jae	L(shr_0_gobble)
    246 	lea	-48(%ecx), %ecx
    247 	xor	%eax, %eax
    248 	movaps	(%esi), %xmm1
    249 	pcmpeqb	(%edi), %xmm1
    250 	movaps	16(%esi), %xmm2
    251 	pcmpeqb	16(%edi), %xmm2
    252 	pand	%xmm1, %xmm2
    253 	pmovmskb %xmm2, %edx
    254 	add	$32, %edi
    255 	add	$32, %esi
    256 	sub	$0xffff, %edx
    257 	jnz	L(exit)
    258 
    259 	lea	(%ecx, %edi,1), %eax
    260 	lea	(%ecx, %esi,1), %edx
    261 	POP	(%edi)
    262 	POP	(%esi)
    263 	jmp	L(less48bytes)
    264 
    265 	cfi_restore_state
    266 	cfi_remember_state
    267 	.p2align 4
    268 L(shr_0_gobble):
    269 	lea	-48(%ecx), %ecx
    270 	movdqa	(%esi), %xmm0
    271 	xor	%eax, %eax
    272 	pcmpeqb	(%edi), %xmm0
    273 	sub	$32, %ecx
    274 	movdqa	16(%esi), %xmm2
    275 	pcmpeqb	16(%edi), %xmm2
    276 L(shr_0_gobble_loop):
    277 	pand	%xmm0, %xmm2
    278 	sub	$32, %ecx
    279 	pmovmskb %xmm2, %edx
    280 	movdqa	%xmm0, %xmm1
    281 	movdqa	32(%esi), %xmm0
    282 	movdqa	48(%esi), %xmm2
    283 	sbb	$0xffff, %edx
    284 	pcmpeqb	32(%edi), %xmm0
    285 	pcmpeqb	48(%edi), %xmm2
    286 	lea	32(%edi), %edi
    287 	lea	32(%esi), %esi
    288 	jz	L(shr_0_gobble_loop)
    289 
    290 	pand	%xmm0, %xmm2
    291 	cmp	$0, %ecx
    292 	jge	L(shr_0_gobble_loop_next)
    293 	inc	%edx
    294 	add	$32, %ecx
    295 L(shr_0_gobble_loop_next):
    296 	test	%edx, %edx
    297 	jnz	L(exit)
    298 
    299 	pmovmskb %xmm2, %edx
    300 	movdqa	%xmm0, %xmm1
    301 	lea	32(%edi), %edi
    302 	lea	32(%esi), %esi
    303 	sub	$0xffff, %edx
    304 	jnz	L(exit)
    305 	lea	(%ecx, %edi,1), %eax
    306 	lea	(%ecx, %esi,1), %edx
    307 	POP	(%edi)
    308 	POP	(%esi)
    309 	jmp	L(less48bytes)
    310 
    311 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    312 	cfi_restore_state
    313 	cfi_remember_state
    314 	.p2align 4
    315 L(shr_1):
    316 	cmp	$80, %ecx
    317 	lea	-48(%ecx), %ecx
    318 	mov	%edx, %eax
    319 	jae	L(shr_1_gobble)
    320 
    321 	movdqa	16(%esi), %xmm1
    322 	movdqa	%xmm1, %xmm2
    323 	palignr	$1,(%esi), %xmm1
    324 	pcmpeqb	(%edi), %xmm1
    325 
    326 	movdqa	32(%esi), %xmm3
    327 	palignr	$1,%xmm2, %xmm3
    328 	pcmpeqb	16(%edi), %xmm3
    329 
    330 	pand	%xmm1, %xmm3
    331 	pmovmskb %xmm3, %edx
    332 	lea	32(%edi), %edi
    333 	lea	32(%esi), %esi
    334 	sub	$0xffff, %edx
    335 	jnz	L(exit)
    336 	lea	(%ecx, %edi,1), %eax
    337 	lea	1(%ecx, %esi,1), %edx
    338 	POP	(%edi)
    339 	POP	(%esi)
    340 	jmp	L(less48bytes)
    341 
    342 	cfi_restore_state
    343 	cfi_remember_state
    344 	.p2align 4
    345 L(shr_1_gobble):
    346 	sub	$32, %ecx
    347 	movdqa	16(%esi), %xmm0
    348 	palignr	$1,(%esi), %xmm0
    349 	pcmpeqb	(%edi), %xmm0
    350 
    351 	movdqa	32(%esi), %xmm3
    352 	palignr	$1,16(%esi), %xmm3
    353 	pcmpeqb	16(%edi), %xmm3
    354 
    355 L(shr_1_gobble_loop):
    356 	pand	%xmm0, %xmm3
    357 	sub	$32, %ecx
    358 	pmovmskb %xmm3, %edx
    359 	movdqa	%xmm0, %xmm1
    360 
    361 	movdqa	64(%esi), %xmm3
    362 	palignr	$1,48(%esi), %xmm3
    363 	sbb	$0xffff, %edx
    364 	movdqa	48(%esi), %xmm0
    365 	palignr	$1,32(%esi), %xmm0
    366 	pcmpeqb	32(%edi), %xmm0
    367 	lea	32(%esi), %esi
    368 	pcmpeqb	48(%edi), %xmm3
    369 
    370 	lea	32(%edi), %edi
    371 	jz	L(shr_1_gobble_loop)
    372 	pand	%xmm0, %xmm3
    373 
    374 	cmp	$0, %ecx
    375 	jge	L(shr_1_gobble_next)
    376 	inc	%edx
    377 	add	$32, %ecx
    378 L(shr_1_gobble_next):
    379 	test	%edx, %edx
    380 	jnz	L(exit)
    381 
    382 	pmovmskb %xmm3, %edx
    383 	movdqa	%xmm0, %xmm1
    384 	lea	32(%edi), %edi
    385 	lea	32(%esi), %esi
    386 	sub	$0xffff, %edx
    387 	jnz	L(exit)
    388 
    389 	lea	(%ecx, %edi,1), %eax
    390 	lea	1(%ecx, %esi,1), %edx
    391 	POP	(%edi)
    392 	POP	(%esi)
    393 	jmp	L(less48bytes)
    394 #endif
    395 
    396 
    397 #if !defined(USE_WCHAR)
    398 	cfi_restore_state
    399 	cfi_remember_state
    400 	.p2align 4
    401 L(shr_2):
    402 	cmp	$80, %ecx
    403 	lea	-48(%ecx), %ecx
    404 	mov	%edx, %eax
    405 	jae	L(shr_2_gobble)
    406 
    407 	movdqa	16(%esi), %xmm1
    408 	movdqa	%xmm1, %xmm2
    409 	palignr	$2,(%esi), %xmm1
    410 	pcmpeqb	(%edi), %xmm1
    411 
    412 	movdqa	32(%esi), %xmm3
    413 	palignr	$2,%xmm2, %xmm3
    414 	pcmpeqb	16(%edi), %xmm3
    415 
    416 	pand	%xmm1, %xmm3
    417 	pmovmskb %xmm3, %edx
    418 	lea	32(%edi), %edi
    419 	lea	32(%esi), %esi
    420 	sub	$0xffff, %edx
    421 	jnz	L(exit)
    422 	lea	(%ecx, %edi,1), %eax
    423 	lea	2(%ecx, %esi,1), %edx
    424 	POP	(%edi)
    425 	POP	(%esi)
    426 	jmp	L(less48bytes)
    427 
    428 	cfi_restore_state
    429 	cfi_remember_state
    430 	.p2align 4
    431 L(shr_2_gobble):
    432 	sub	$32, %ecx
    433 	movdqa	16(%esi), %xmm0
    434 	palignr	$2,(%esi), %xmm0
    435 	pcmpeqb	(%edi), %xmm0
    436 
    437 	movdqa	32(%esi), %xmm3
    438 	palignr	$2,16(%esi), %xmm3
    439 	pcmpeqb	16(%edi), %xmm3
    440 
    441 L(shr_2_gobble_loop):
    442 	pand	%xmm0, %xmm3
    443 	sub	$32, %ecx
    444 	pmovmskb %xmm3, %edx
    445 	movdqa	%xmm0, %xmm1
    446 
    447 	movdqa	64(%esi), %xmm3
    448 	palignr	$2,48(%esi), %xmm3
    449 	sbb	$0xffff, %edx
    450 	movdqa	48(%esi), %xmm0
    451 	palignr	$2,32(%esi), %xmm0
    452 	pcmpeqb	32(%edi), %xmm0
    453 	lea	32(%esi), %esi
    454 	pcmpeqb	48(%edi), %xmm3
    455 
    456 	lea	32(%edi), %edi
    457 	jz	L(shr_2_gobble_loop)
    458 	pand	%xmm0, %xmm3
    459 
    460 	cmp	$0, %ecx
    461 	jge	L(shr_2_gobble_next)
    462 	inc	%edx
    463 	add	$32, %ecx
    464 L(shr_2_gobble_next):
    465 	test	%edx, %edx
    466 	jnz	L(exit)
    467 
    468 	pmovmskb %xmm3, %edx
    469 	movdqa	%xmm0, %xmm1
    470 	lea	32(%edi), %edi
    471 	lea	32(%esi), %esi
    472 	sub	$0xffff, %edx
    473 	jnz	L(exit)
    474 
    475 	lea	(%ecx, %edi,1), %eax
    476 	lea	2(%ecx, %esi,1), %edx
    477 	POP	(%edi)
    478 	POP	(%esi)
    479 	jmp	L(less48bytes)
    480 #endif
    481 
    482 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    483 	cfi_restore_state
    484 	cfi_remember_state
    485 	.p2align 4
    486 L(shr_3):
    487 	cmp	$80, %ecx
    488 	lea	-48(%ecx), %ecx
    489 	mov	%edx, %eax
    490 	jae	L(shr_3_gobble)
    491 
    492 	movdqa	16(%esi), %xmm1
    493 	movdqa	%xmm1, %xmm2
    494 	palignr	$3,(%esi), %xmm1
    495 	pcmpeqb	(%edi), %xmm1
    496 
    497 	movdqa	32(%esi), %xmm3
    498 	palignr	$3,%xmm2, %xmm3
    499 	pcmpeqb	16(%edi), %xmm3
    500 
    501 	pand	%xmm1, %xmm3
    502 	pmovmskb %xmm3, %edx
    503 	lea	32(%edi), %edi
    504 	lea	32(%esi), %esi
    505 	sub	$0xffff, %edx
    506 	jnz	L(exit)
    507 	lea	(%ecx, %edi,1), %eax
    508 	lea	3(%ecx, %esi,1), %edx
    509 	POP	(%edi)
    510 	POP	(%esi)
    511 	jmp	L(less48bytes)
    512 
    513 	cfi_restore_state
    514 	cfi_remember_state
    515 	.p2align 4
    516 L(shr_3_gobble):
    517 	sub	$32, %ecx
    518 	movdqa	16(%esi), %xmm0
    519 	palignr	$3,(%esi), %xmm0
    520 	pcmpeqb	(%edi), %xmm0
    521 
    522 	movdqa	32(%esi), %xmm3
    523 	palignr	$3,16(%esi), %xmm3
    524 	pcmpeqb	16(%edi), %xmm3
    525 
    526 L(shr_3_gobble_loop):
    527 	pand	%xmm0, %xmm3
    528 	sub	$32, %ecx
    529 	pmovmskb %xmm3, %edx
    530 	movdqa	%xmm0, %xmm1
    531 
    532 	movdqa	64(%esi), %xmm3
    533 	palignr	$3,48(%esi), %xmm3
    534 	sbb	$0xffff, %edx
    535 	movdqa	48(%esi), %xmm0
    536 	palignr	$3,32(%esi), %xmm0
    537 	pcmpeqb	32(%edi), %xmm0
    538 	lea	32(%esi), %esi
    539 	pcmpeqb	48(%edi), %xmm3
    540 
    541 	lea	32(%edi), %edi
    542 	jz	L(shr_3_gobble_loop)
    543 	pand	%xmm0, %xmm3
    544 
    545 	cmp	$0, %ecx
    546 	jge	L(shr_3_gobble_next)
    547 	inc	%edx
    548 	add	$32, %ecx
    549 L(shr_3_gobble_next):
    550 	test	%edx, %edx
    551 	jnz	L(exit)
    552 
    553 	pmovmskb %xmm3, %edx
    554 	movdqa	%xmm0, %xmm1
    555 	lea	32(%edi), %edi
    556 	lea	32(%esi), %esi
    557 	sub	$0xffff, %edx
    558 	jnz	L(exit)
    559 
    560 	lea	(%ecx, %edi,1), %eax
    561 	lea	3(%ecx, %esi,1), %edx
    562 	POP	(%edi)
    563 	POP	(%esi)
    564 	jmp	L(less48bytes)
    565 #endif
    566 
    567 	cfi_restore_state
    568 	cfi_remember_state
    569 	.p2align 4
    570 L(shr_4):
    571 	cmp	$80, %ecx
    572 	lea	-48(%ecx), %ecx
    573 	mov	%edx, %eax
    574 	jae	L(shr_4_gobble)
    575 
    576 	movdqa	16(%esi), %xmm1
    577 	movdqa	%xmm1, %xmm2
    578 	palignr	$4,(%esi), %xmm1
    579 	pcmpeqb	(%edi), %xmm1
    580 
    581 	movdqa	32(%esi), %xmm3
    582 	palignr	$4,%xmm2, %xmm3
    583 	pcmpeqb	16(%edi), %xmm3
    584 
    585 	pand	%xmm1, %xmm3
    586 	pmovmskb %xmm3, %edx
    587 	lea	32(%edi), %edi
    588 	lea	32(%esi), %esi
    589 	sub	$0xffff, %edx
    590 	jnz	L(exit)
    591 	lea	(%ecx, %edi,1), %eax
    592 	lea	4(%ecx, %esi,1), %edx
    593 	POP	(%edi)
    594 	POP	(%esi)
    595 	jmp	L(less48bytes)
    596 
    597 	cfi_restore_state
    598 	cfi_remember_state
    599 	.p2align 4
    600 L(shr_4_gobble):
    601 	sub	$32, %ecx
    602 	movdqa	16(%esi), %xmm0
    603 	palignr	$4,(%esi), %xmm0
    604 	pcmpeqb	(%edi), %xmm0
    605 
    606 	movdqa	32(%esi), %xmm3
    607 	palignr	$4,16(%esi), %xmm3
    608 	pcmpeqb	16(%edi), %xmm3
    609 
    610 L(shr_4_gobble_loop):
    611 	pand	%xmm0, %xmm3
    612 	sub	$32, %ecx
    613 	pmovmskb %xmm3, %edx
    614 	movdqa	%xmm0, %xmm1
    615 
    616 	movdqa	64(%esi), %xmm3
    617 	palignr	$4,48(%esi), %xmm3
    618 	sbb	$0xffff, %edx
    619 	movdqa	48(%esi), %xmm0
    620 	palignr	$4,32(%esi), %xmm0
    621 	pcmpeqb	32(%edi), %xmm0
    622 	lea	32(%esi), %esi
    623 	pcmpeqb	48(%edi), %xmm3
    624 
    625 	lea	32(%edi), %edi
    626 	jz	L(shr_4_gobble_loop)
    627 	pand	%xmm0, %xmm3
    628 
    629 	cmp	$0, %ecx
    630 	jge	L(shr_4_gobble_next)
    631 	inc	%edx
    632 	add	$32, %ecx
    633 L(shr_4_gobble_next):
    634 	test	%edx, %edx
    635 	jnz	L(exit)
    636 
    637 	pmovmskb %xmm3, %edx
    638 	movdqa	%xmm0, %xmm1
    639 	lea	32(%edi), %edi
    640 	lea	32(%esi), %esi
    641 	sub	$0xffff, %edx
    642 	jnz	L(exit)
    643 
    644 	lea	(%ecx, %edi,1), %eax
    645 	lea	4(%ecx, %esi,1), %edx
    646 	POP	(%edi)
    647 	POP	(%esi)
    648 	jmp	L(less48bytes)
    649 
    650 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    651 	cfi_restore_state
    652 	cfi_remember_state
    653 	.p2align 4
    654 L(shr_5):
    655 	cmp	$80, %ecx
    656 	lea	-48(%ecx), %ecx
    657 	mov	%edx, %eax
    658 	jae	L(shr_5_gobble)
    659 
    660 	movdqa	16(%esi), %xmm1
    661 	movdqa	%xmm1, %xmm2
    662 	palignr	$5,(%esi), %xmm1
    663 	pcmpeqb	(%edi), %xmm1
    664 
    665 	movdqa	32(%esi), %xmm3
    666 	palignr	$5,%xmm2, %xmm3
    667 	pcmpeqb	16(%edi), %xmm3
    668 
    669 	pand	%xmm1, %xmm3
    670 	pmovmskb %xmm3, %edx
    671 	lea	32(%edi), %edi
    672 	lea	32(%esi), %esi
    673 	sub	$0xffff, %edx
    674 	jnz	L(exit)
    675 	lea	(%ecx, %edi,1), %eax
    676 	lea	5(%ecx, %esi,1), %edx
    677 	POP	(%edi)
    678 	POP	(%esi)
    679 	jmp	L(less48bytes)
    680 
    681 	cfi_restore_state
    682 	cfi_remember_state
    683 	.p2align 4
    684 L(shr_5_gobble):
    685 	sub	$32, %ecx
    686 	movdqa	16(%esi), %xmm0
    687 	palignr	$5,(%esi), %xmm0
    688 	pcmpeqb	(%edi), %xmm0
    689 
    690 	movdqa	32(%esi), %xmm3
    691 	palignr	$5,16(%esi), %xmm3
    692 	pcmpeqb	16(%edi), %xmm3
    693 
    694 L(shr_5_gobble_loop):
    695 	pand	%xmm0, %xmm3
    696 	sub	$32, %ecx
    697 	pmovmskb %xmm3, %edx
    698 	movdqa	%xmm0, %xmm1
    699 
    700 	movdqa	64(%esi), %xmm3
    701 	palignr	$5,48(%esi), %xmm3
    702 	sbb	$0xffff, %edx
    703 	movdqa	48(%esi), %xmm0
    704 	palignr	$5,32(%esi), %xmm0
    705 	pcmpeqb	32(%edi), %xmm0
    706 	lea	32(%esi), %esi
    707 	pcmpeqb	48(%edi), %xmm3
    708 
    709 	lea	32(%edi), %edi
    710 	jz	L(shr_5_gobble_loop)
    711 	pand	%xmm0, %xmm3
    712 
    713 	cmp	$0, %ecx
    714 	jge	L(shr_5_gobble_next)
    715 	inc	%edx
    716 	add	$32, %ecx
    717 L(shr_5_gobble_next):
    718 	test	%edx, %edx
    719 	jnz	L(exit)
    720 
    721 	pmovmskb %xmm3, %edx
    722 	movdqa	%xmm0, %xmm1
    723 	lea	32(%edi), %edi
    724 	lea	32(%esi), %esi
    725 	sub	$0xffff, %edx
    726 	jnz	L(exit)
    727 
    728 	lea	(%ecx, %edi,1), %eax
    729 	lea	5(%ecx, %esi,1), %edx
    730 	POP	(%edi)
    731 	POP	(%esi)
    732 	jmp	L(less48bytes)
    733 #endif
    734 
    735 #if !defined(USE_WCHAR)
    736 	cfi_restore_state
    737 	cfi_remember_state
    738 	.p2align 4
    739 L(shr_6):
    740 	cmp	$80, %ecx
    741 	lea	-48(%ecx), %ecx
    742 	mov	%edx, %eax
    743 	jae	L(shr_6_gobble)
    744 
    745 	movdqa	16(%esi), %xmm1
    746 	movdqa	%xmm1, %xmm2
    747 	palignr	$6,(%esi), %xmm1
    748 	pcmpeqb	(%edi), %xmm1
    749 
    750 	movdqa	32(%esi), %xmm3
    751 	palignr	$6,%xmm2, %xmm3
    752 	pcmpeqb	16(%edi), %xmm3
    753 
    754 	pand	%xmm1, %xmm3
    755 	pmovmskb %xmm3, %edx
    756 	lea	32(%edi), %edi
    757 	lea	32(%esi), %esi
    758 	sub	$0xffff, %edx
    759 	jnz	L(exit)
    760 	lea	(%ecx, %edi,1), %eax
    761 	lea	6(%ecx, %esi,1), %edx
    762 	POP	(%edi)
    763 	POP	(%esi)
    764 	jmp	L(less48bytes)
    765 
    766 	cfi_restore_state
    767 	cfi_remember_state
    768 	.p2align 4
    769 L(shr_6_gobble):
    770 	sub	$32, %ecx
    771 	movdqa	16(%esi), %xmm0
    772 	palignr	$6,(%esi), %xmm0
    773 	pcmpeqb	(%edi), %xmm0
    774 
    775 	movdqa	32(%esi), %xmm3
    776 	palignr	$6,16(%esi), %xmm3
    777 	pcmpeqb	16(%edi), %xmm3
    778 
    779 L(shr_6_gobble_loop):
    780 	pand	%xmm0, %xmm3
    781 	sub	$32, %ecx
    782 	pmovmskb %xmm3, %edx
    783 	movdqa	%xmm0, %xmm1
    784 
    785 	movdqa	64(%esi), %xmm3
    786 	palignr	$6,48(%esi), %xmm3
    787 	sbb	$0xffff, %edx
    788 	movdqa	48(%esi), %xmm0
    789 	palignr	$6,32(%esi), %xmm0
    790 	pcmpeqb	32(%edi), %xmm0
    791 	lea	32(%esi), %esi
    792 	pcmpeqb	48(%edi), %xmm3
    793 
    794 	lea	32(%edi), %edi
    795 	jz	L(shr_6_gobble_loop)
    796 	pand	%xmm0, %xmm3
    797 
    798 	cmp	$0, %ecx
    799 	jge	L(shr_6_gobble_next)
    800 	inc	%edx
    801 	add	$32, %ecx
    802 L(shr_6_gobble_next):
    803 	test	%edx, %edx
    804 	jnz	L(exit)
    805 
    806 	pmovmskb %xmm3, %edx
    807 	movdqa	%xmm0, %xmm1
    808 	lea	32(%edi), %edi
    809 	lea	32(%esi), %esi
    810 	sub	$0xffff, %edx
    811 	jnz	L(exit)
    812 
    813 	lea	(%ecx, %edi,1), %eax
    814 	lea	6(%ecx, %esi,1), %edx
    815 	POP	(%edi)
    816 	POP	(%esi)
    817 	jmp	L(less48bytes)
    818 #endif
    819 
    820 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    821 	cfi_restore_state
    822 	cfi_remember_state
    823 	.p2align 4
    824 L(shr_7):
    825 	cmp	$80, %ecx
    826 	lea	-48(%ecx), %ecx
    827 	mov	%edx, %eax
    828 	jae	L(shr_7_gobble)
    829 
    830 	movdqa	16(%esi), %xmm1
    831 	movdqa	%xmm1, %xmm2
    832 	palignr	$7,(%esi), %xmm1
    833 	pcmpeqb	(%edi), %xmm1
    834 
    835 	movdqa	32(%esi), %xmm3
    836 	palignr	$7,%xmm2, %xmm3
    837 	pcmpeqb	16(%edi), %xmm3
    838 
    839 	pand	%xmm1, %xmm3
    840 	pmovmskb %xmm3, %edx
    841 	lea	32(%edi), %edi
    842 	lea	32(%esi), %esi
    843 	sub	$0xffff, %edx
    844 	jnz	L(exit)
    845 	lea	(%ecx, %edi,1), %eax
    846 	lea	7(%ecx, %esi,1), %edx
    847 	POP	(%edi)
    848 	POP	(%esi)
    849 	jmp	L(less48bytes)
    850 
    851 	cfi_restore_state
    852 	cfi_remember_state
    853 	.p2align 4
    854 L(shr_7_gobble):
    855 	sub	$32, %ecx
    856 	movdqa	16(%esi), %xmm0
    857 	palignr	$7,(%esi), %xmm0
    858 	pcmpeqb	(%edi), %xmm0
    859 
    860 	movdqa	32(%esi), %xmm3
    861 	palignr	$7,16(%esi), %xmm3
    862 	pcmpeqb	16(%edi), %xmm3
    863 
    864 L(shr_7_gobble_loop):
    865 	pand	%xmm0, %xmm3
    866 	sub	$32, %ecx
    867 	pmovmskb %xmm3, %edx
    868 	movdqa	%xmm0, %xmm1
    869 
    870 	movdqa	64(%esi), %xmm3
    871 	palignr	$7,48(%esi), %xmm3
    872 	sbb	$0xffff, %edx
    873 	movdqa	48(%esi), %xmm0
    874 	palignr	$7,32(%esi), %xmm0
    875 	pcmpeqb	32(%edi), %xmm0
    876 	lea	32(%esi), %esi
    877 	pcmpeqb	48(%edi), %xmm3
    878 
    879 	lea	32(%edi), %edi
    880 	jz	L(shr_7_gobble_loop)
    881 	pand	%xmm0, %xmm3
    882 
    883 	cmp	$0, %ecx
    884 	jge	L(shr_7_gobble_next)
    885 	inc	%edx
    886 	add	$32, %ecx
    887 L(shr_7_gobble_next):
    888 	test	%edx, %edx
    889 	jnz	L(exit)
    890 
    891 	pmovmskb %xmm3, %edx
    892 	movdqa	%xmm0, %xmm1
    893 	lea	32(%edi), %edi
    894 	lea	32(%esi), %esi
    895 	sub	$0xffff, %edx
    896 	jnz	L(exit)
    897 
    898 	lea	(%ecx, %edi,1), %eax
    899 	lea	7(%ecx, %esi,1), %edx
    900 	POP	(%edi)
    901 	POP	(%esi)
    902 	jmp	L(less48bytes)
    903 #endif
    904 
    905 	cfi_restore_state
    906 	cfi_remember_state
    907 	.p2align 4
    908 L(shr_8):
    909 	cmp	$80, %ecx
    910 	lea	-48(%ecx), %ecx
    911 	mov	%edx, %eax
    912 	jae	L(shr_8_gobble)
    913 
    914 	movdqa	16(%esi), %xmm1
    915 	movdqa	%xmm1, %xmm2
    916 	palignr	$8,(%esi), %xmm1
    917 	pcmpeqb	(%edi), %xmm1
    918 
    919 	movdqa	32(%esi), %xmm3
    920 	palignr	$8,%xmm2, %xmm3
    921 	pcmpeqb	16(%edi), %xmm3
    922 
    923 	pand	%xmm1, %xmm3
    924 	pmovmskb %xmm3, %edx
    925 	lea	32(%edi), %edi
    926 	lea	32(%esi), %esi
    927 	sub	$0xffff, %edx
    928 	jnz	L(exit)
    929 	lea	(%ecx, %edi,1), %eax
    930 	lea	8(%ecx, %esi,1), %edx
    931 	POP	(%edi)
    932 	POP	(%esi)
    933 	jmp	L(less48bytes)
    934 
    935 	cfi_restore_state
    936 	cfi_remember_state
    937 	.p2align 4
    938 L(shr_8_gobble):
    939 	sub	$32, %ecx
    940 	movdqa	16(%esi), %xmm0
    941 	palignr	$8,(%esi), %xmm0
    942 	pcmpeqb	(%edi), %xmm0
    943 
    944 	movdqa	32(%esi), %xmm3
    945 	palignr	$8,16(%esi), %xmm3
    946 	pcmpeqb	16(%edi), %xmm3
    947 
    948 L(shr_8_gobble_loop):
    949 	pand	%xmm0, %xmm3
    950 	sub	$32, %ecx
    951 	pmovmskb %xmm3, %edx
    952 	movdqa	%xmm0, %xmm1
    953 
    954 	movdqa	64(%esi), %xmm3
    955 	palignr	$8,48(%esi), %xmm3
    956 	sbb	$0xffff, %edx
    957 	movdqa	48(%esi), %xmm0
    958 	palignr	$8,32(%esi), %xmm0
    959 	pcmpeqb	32(%edi), %xmm0
    960 	lea	32(%esi), %esi
    961 	pcmpeqb	48(%edi), %xmm3
    962 
    963 	lea	32(%edi), %edi
    964 	jz	L(shr_8_gobble_loop)
    965 	pand	%xmm0, %xmm3
    966 
    967 	cmp	$0, %ecx
    968 	jge	L(shr_8_gobble_next)
    969 	inc	%edx
    970 	add	$32, %ecx
    971 L(shr_8_gobble_next):
    972 	test	%edx, %edx
    973 	jnz	L(exit)
    974 
    975 	pmovmskb %xmm3, %edx
    976 	movdqa	%xmm0, %xmm1
    977 	lea	32(%edi), %edi
    978 	lea	32(%esi), %esi
    979 	sub	$0xffff, %edx
    980 	jnz	L(exit)
    981 
    982 	lea	(%ecx, %edi,1), %eax
    983 	lea	8(%ecx, %esi,1), %edx
    984 	POP	(%edi)
    985 	POP	(%esi)
    986 	jmp	L(less48bytes)
    987 
    988 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
    989 	cfi_restore_state
    990 	cfi_remember_state
    991 	.p2align 4
    992 L(shr_9):
    993 	cmp	$80, %ecx
    994 	lea	-48(%ecx), %ecx
    995 	mov	%edx, %eax
    996 	jae	L(shr_9_gobble)
    997 
    998 	movdqa	16(%esi), %xmm1
    999 	movdqa	%xmm1, %xmm2
   1000 	palignr	$9,(%esi), %xmm1
   1001 	pcmpeqb	(%edi), %xmm1
   1002 
   1003 	movdqa	32(%esi), %xmm3
   1004 	palignr	$9,%xmm2, %xmm3
   1005 	pcmpeqb	16(%edi), %xmm3
   1006 
   1007 	pand	%xmm1, %xmm3
   1008 	pmovmskb %xmm3, %edx
   1009 	lea	32(%edi), %edi
   1010 	lea	32(%esi), %esi
   1011 	sub	$0xffff, %edx
   1012 	jnz	L(exit)
   1013 	lea	(%ecx, %edi,1), %eax
   1014 	lea	9(%ecx, %esi,1), %edx
   1015 	POP	(%edi)
   1016 	POP	(%esi)
   1017 	jmp	L(less48bytes)
   1018 
   1019 	cfi_restore_state
   1020 	cfi_remember_state
   1021 	.p2align 4
   1022 L(shr_9_gobble):
   1023 	sub	$32, %ecx
   1024 	movdqa	16(%esi), %xmm0
   1025 	palignr	$9,(%esi), %xmm0
   1026 	pcmpeqb	(%edi), %xmm0
   1027 
   1028 	movdqa	32(%esi), %xmm3
   1029 	palignr	$9,16(%esi), %xmm3
   1030 	pcmpeqb	16(%edi), %xmm3
   1031 
   1032 L(shr_9_gobble_loop):
   1033 	pand	%xmm0, %xmm3
   1034 	sub	$32, %ecx
   1035 	pmovmskb %xmm3, %edx
   1036 	movdqa	%xmm0, %xmm1
   1037 
   1038 	movdqa	64(%esi), %xmm3
   1039 	palignr	$9,48(%esi), %xmm3
   1040 	sbb	$0xffff, %edx
   1041 	movdqa	48(%esi), %xmm0
   1042 	palignr	$9,32(%esi), %xmm0
   1043 	pcmpeqb	32(%edi), %xmm0
   1044 	lea	32(%esi), %esi
   1045 	pcmpeqb	48(%edi), %xmm3
   1046 
   1047 	lea	32(%edi), %edi
   1048 	jz	L(shr_9_gobble_loop)
   1049 	pand	%xmm0, %xmm3
   1050 
   1051 	cmp	$0, %ecx
   1052 	jge	L(shr_9_gobble_next)
   1053 	inc	%edx
   1054 	add	$32, %ecx
   1055 L(shr_9_gobble_next):
   1056 	test	%edx, %edx
   1057 	jnz	L(exit)
   1058 
   1059 	pmovmskb %xmm3, %edx
   1060 	movdqa	%xmm0, %xmm1
   1061 	lea	32(%edi), %edi
   1062 	lea	32(%esi), %esi
   1063 	sub	$0xffff, %edx
   1064 	jnz	L(exit)
   1065 
   1066 	lea	(%ecx, %edi,1), %eax
   1067 	lea	9(%ecx, %esi,1), %edx
   1068 	POP	(%edi)
   1069 	POP	(%esi)
   1070 	jmp	L(less48bytes)
   1071 #endif
   1072 
   1073 #if !defined(USE_WCHAR)
   1074 	cfi_restore_state
   1075 	cfi_remember_state
   1076 	.p2align 4
   1077 L(shr_10):
   1078 	cmp	$80, %ecx
   1079 	lea	-48(%ecx), %ecx
   1080 	mov	%edx, %eax
   1081 	jae	L(shr_10_gobble)
   1082 
   1083 	movdqa	16(%esi), %xmm1
   1084 	movdqa	%xmm1, %xmm2
   1085 	palignr	$10, (%esi), %xmm1
   1086 	pcmpeqb	(%edi), %xmm1
   1087 
   1088 	movdqa	32(%esi), %xmm3
   1089 	palignr	$10,%xmm2, %xmm3
   1090 	pcmpeqb	16(%edi), %xmm3
   1091 
   1092 	pand	%xmm1, %xmm3
   1093 	pmovmskb %xmm3, %edx
   1094 	lea	32(%edi), %edi
   1095 	lea	32(%esi), %esi
   1096 	sub	$0xffff, %edx
   1097 	jnz	L(exit)
   1098 	lea	(%ecx, %edi,1), %eax
   1099 	lea	10(%ecx, %esi,1), %edx
   1100 	POP	(%edi)
   1101 	POP	(%esi)
   1102 	jmp	L(less48bytes)
   1103 
   1104 	cfi_restore_state
   1105 	cfi_remember_state
   1106 	.p2align 4
   1107 L(shr_10_gobble):
   1108 	sub	$32, %ecx
   1109 	movdqa	16(%esi), %xmm0
   1110 	palignr	$10, (%esi), %xmm0
   1111 	pcmpeqb	(%edi), %xmm0
   1112 
   1113 	movdqa	32(%esi), %xmm3
   1114 	palignr	$10, 16(%esi), %xmm3
   1115 	pcmpeqb	16(%edi), %xmm3
   1116 
   1117 L(shr_10_gobble_loop):
   1118 	pand	%xmm0, %xmm3
   1119 	sub	$32, %ecx
   1120 	pmovmskb %xmm3, %edx
   1121 	movdqa	%xmm0, %xmm1
   1122 
   1123 	movdqa	64(%esi), %xmm3
   1124 	palignr	$10,48(%esi), %xmm3
   1125 	sbb	$0xffff, %edx
   1126 	movdqa	48(%esi), %xmm0
   1127 	palignr	$10,32(%esi), %xmm0
   1128 	pcmpeqb	32(%edi), %xmm0
   1129 	lea	32(%esi), %esi
   1130 	pcmpeqb	48(%edi), %xmm3
   1131 
   1132 	lea	32(%edi), %edi
   1133 	jz	L(shr_10_gobble_loop)
   1134 	pand	%xmm0, %xmm3
   1135 
   1136 	cmp	$0, %ecx
   1137 	jge	L(shr_10_gobble_next)
   1138 	inc	%edx
   1139 	add	$32, %ecx
   1140 L(shr_10_gobble_next):
   1141 	test	%edx, %edx
   1142 	jnz	L(exit)
   1143 
   1144 	pmovmskb %xmm3, %edx
   1145 	movdqa	%xmm0, %xmm1
   1146 	lea	32(%edi), %edi
   1147 	lea	32(%esi), %esi
   1148 	sub	$0xffff, %edx
   1149 	jnz	L(exit)
   1150 
   1151 	lea	(%ecx, %edi,1), %eax
   1152 	lea	10(%ecx, %esi,1), %edx
   1153 	POP	(%edi)
   1154 	POP	(%esi)
   1155 	jmp	L(less48bytes)
   1156 #endif
   1157 
   1158 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1159 	cfi_restore_state
   1160 	cfi_remember_state
   1161 	.p2align 4
   1162 L(shr_11):
   1163 	cmp	$80, %ecx
   1164 	lea	-48(%ecx), %ecx
   1165 	mov	%edx, %eax
   1166 	jae	L(shr_11_gobble)
   1167 
   1168 	movdqa	16(%esi), %xmm1
   1169 	movdqa	%xmm1, %xmm2
   1170 	palignr	$11, (%esi), %xmm1
   1171 	pcmpeqb	(%edi), %xmm1
   1172 
   1173 	movdqa	32(%esi), %xmm3
   1174 	palignr	$11, %xmm2, %xmm3
   1175 	pcmpeqb	16(%edi), %xmm3
   1176 
   1177 	pand	%xmm1, %xmm3
   1178 	pmovmskb %xmm3, %edx
   1179 	lea	32(%edi), %edi
   1180 	lea	32(%esi), %esi
   1181 	sub	$0xffff, %edx
   1182 	jnz	L(exit)
   1183 	lea	(%ecx, %edi,1), %eax
   1184 	lea	11(%ecx, %esi,1), %edx
   1185 	POP	(%edi)
   1186 	POP	(%esi)
   1187 	jmp	L(less48bytes)
   1188 
   1189 	cfi_restore_state
   1190 	cfi_remember_state
   1191 	.p2align 4
   1192 L(shr_11_gobble):
        	/* Loop variant of L(shr_11): compares 32 bytes per iteration using
        	   palignr $11 on the %esi side.  %xmm0 holds the equality result
        	   of the first 16 bytes of a chunk, %xmm3 that of the second 16;
        	   the results for the next chunk are computed while the current
        	   one is being tested.  */
   1193 	sub	$32, %ecx
   1194 	movdqa	16(%esi), %xmm0
   1195 	palignr	$11, (%esi), %xmm0
   1196 	pcmpeqb	(%edi), %xmm0
   1197 
   1198 	movdqa	32(%esi), %xmm3
   1199 	palignr	$11, 16(%esi), %xmm3
   1200 	pcmpeqb	16(%edi), %xmm3
   1201 
   1202 L(shr_11_gobble_loop):
   1203 	pand	%xmm0, %xmm3
   1204 	sub	$32, %ecx	/* CF = 1 when %ecx was below 32 (unsigned borrow) */
   1205 	pmovmskb %xmm3, %edx
   1206 	movdqa	%xmm0, %xmm1	/* L(exit) inspects %xmm1 for the first 16 bytes */
   1207 
   1208 	movdqa	64(%esi), %xmm3
   1209 	palignr	$11,48(%esi), %xmm3
   1210 	sbb	$0xffff, %edx	/* mask - 0xffff - CF: zero only if all equal and no borrow */
   1211 	movdqa	48(%esi), %xmm0
   1212 	palignr	$11,32(%esi), %xmm0
   1213 	pcmpeqb	32(%edi), %xmm0
   1214 	lea	32(%esi), %esi
   1215 	pcmpeqb	48(%edi), %xmm3
   1216 
   1217 	lea	32(%edi), %edi
   1218 	jz	L(shr_11_gobble_loop)	/* ZF is still the sbb result: SSE ops and lea leave flags alone */
   1219 	pand	%xmm0, %xmm3
   1220 
   1221 	cmp	$0, %ecx
   1222 	jge	L(shr_11_gobble_next)
   1223 	inc	%edx	/* count went negative: presumably cancels the borrow folded in by sbb */
   1224 	add	$32, %ecx
   1225 L(shr_11_gobble_next):
   1226 	test	%edx, %edx
   1227 	jnz	L(exit)	/* mismatch in the chunk just tested */
   1228 
   1229 	pmovmskb %xmm3, %edx	/* now test the chunk that was computed ahead */
   1230 	movdqa	%xmm0, %xmm1
   1231 	lea	32(%edi), %edi
   1232 	lea	32(%esi), %esi
   1233 	sub	$0xffff, %edx
   1234 	jnz	L(exit)
   1235 
   1236 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1237 	lea	11(%ecx, %esi,1), %edx	/* %edx = %esi + 11 + remaining count */
   1238 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1239 	POP	(%esi)
   1240 	jmp	L(less48bytes)
   1241 #endif
   1242 
   1243 	cfi_restore_state
   1244 	cfi_remember_state
   1245 	.p2align 4
   1246 L(shr_12):
        	/* Compare path that realigns the %esi side with palignr $12.
        	   Unlike its odd-shift neighbours this block is not wrapped in a
        	   USE_WCHAR/USE_UTF16 conditional.  %ecx is the byte count: 80 or
        	   more (unsigned) goes to the gobble loop, otherwise one 32-byte
        	   chunk is compared here and the rest goes to L(less48bytes).
        	   NOTE(review): %edx on entry is presumably the source
        	   misalignment (12) set outside this chunk -- confirm.  */
   1247 	cmp	$80, %ecx
   1248 	lea	-48(%ecx), %ecx	/* lea and mov keep the flags from cmp for the jae below */
   1249 	mov	%edx, %eax	/* saved entry %edx; L(exit) adds %eax back onto %esi */
   1250 	jae	L(shr_12_gobble)
   1251 
   1252 	movdqa	16(%esi), %xmm1
   1253 	movdqa	%xmm1, %xmm2	/* keep the raw second block for the next palignr */
   1254 	palignr	$12, (%esi), %xmm1	/* 16 bytes starting 12 bytes into (%esi) */
   1255 	pcmpeqb	(%edi), %xmm1	/* %xmm1 = per-byte equality for bytes 0..15 */
   1256 
   1257 	movdqa	32(%esi), %xmm3
   1258 	palignr	$12, %xmm2, %xmm3
   1259 	pcmpeqb	16(%edi), %xmm3	/* %xmm3 = per-byte equality for bytes 16..31 */
   1260 
   1261 	pand	%xmm1, %xmm3
   1262 	pmovmskb %xmm3, %edx
   1263 	lea	32(%edi), %edi
   1264 	lea	32(%esi), %esi
   1265 	sub	$0xffff, %edx	/* zero only when all 32 bytes matched */
   1266 	jnz	L(exit)
   1267 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1268 	lea	12(%ecx, %esi,1), %edx	/* %edx = %esi + 12 + remaining count */
   1269 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1270 	POP	(%esi)
   1271 	jmp	L(less48bytes)
   1272 
   1273 	cfi_restore_state
   1274 	cfi_remember_state
   1275 	.p2align 4
   1276 L(shr_12_gobble):
        	/* Loop variant of L(shr_12): compares 32 bytes per iteration using
        	   palignr $12 on the %esi side.  %xmm0 holds the equality result
        	   of the first 16 bytes of a chunk, %xmm3 that of the second 16;
        	   the results for the next chunk are computed while the current
        	   one is being tested.  */
   1277 	sub	$32, %ecx
   1278 	movdqa	16(%esi), %xmm0
   1279 	palignr	$12, (%esi), %xmm0
   1280 	pcmpeqb	(%edi), %xmm0
   1281 
   1282 	movdqa	32(%esi), %xmm3
   1283 	palignr	$12, 16(%esi), %xmm3
   1284 	pcmpeqb	16(%edi), %xmm3
   1285 
   1286 L(shr_12_gobble_loop):
   1287 	pand	%xmm0, %xmm3
   1288 	sub	$32, %ecx	/* CF = 1 when %ecx was below 32 (unsigned borrow) */
   1289 	pmovmskb %xmm3, %edx
   1290 	movdqa	%xmm0, %xmm1	/* L(exit) inspects %xmm1 for the first 16 bytes */
   1291 
   1292 	movdqa	64(%esi), %xmm3
   1293 	palignr	$12,48(%esi), %xmm3
   1294 	sbb	$0xffff, %edx	/* mask - 0xffff - CF: zero only if all equal and no borrow */
   1295 	movdqa	48(%esi), %xmm0
   1296 	palignr	$12,32(%esi), %xmm0
   1297 	pcmpeqb	32(%edi), %xmm0
   1298 	lea	32(%esi), %esi
   1299 	pcmpeqb	48(%edi), %xmm3
   1300 
   1301 	lea	32(%edi), %edi
   1302 	jz	L(shr_12_gobble_loop)	/* ZF is still the sbb result: SSE ops and lea leave flags alone */
   1303 	pand	%xmm0, %xmm3
   1304 
   1305 	cmp	$0, %ecx
   1306 	jge	L(shr_12_gobble_next)
   1307 	inc	%edx	/* count went negative: presumably cancels the borrow folded in by sbb */
   1308 	add	$32, %ecx
   1309 L(shr_12_gobble_next):
   1310 	test	%edx, %edx
   1311 	jnz	L(exit)	/* mismatch in the chunk just tested */
   1312 
   1313 	pmovmskb %xmm3, %edx	/* now test the chunk that was computed ahead */
   1314 	movdqa	%xmm0, %xmm1
   1315 	lea	32(%edi), %edi
   1316 	lea	32(%esi), %esi
   1317 	sub	$0xffff, %edx
   1318 	jnz	L(exit)
   1319 
   1320 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1321 	lea	12(%ecx, %esi,1), %edx	/* %edx = %esi + 12 + remaining count */
   1322 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1323 	POP	(%esi)
   1324 	jmp	L(less48bytes)
   1325 
   1326 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1327 	cfi_restore_state
   1328 	cfi_remember_state
   1329 	.p2align 4
   1330 L(shr_13):
        	/* Compare path that realigns the %esi side with palignr $13; only
        	   assembled when neither USE_WCHAR nor USE_UTF16 is defined.
        	   %ecx is the byte count: 80 or more (unsigned) goes to the gobble
        	   loop, otherwise one 32-byte chunk is compared here and the rest
        	   goes to L(less48bytes).
        	   NOTE(review): %edx on entry is presumably the source
        	   misalignment (13) set outside this chunk -- confirm.  */
   1331 	cmp	$80, %ecx
   1332 	lea	-48(%ecx), %ecx	/* lea and mov keep the flags from cmp for the jae below */
   1333 	mov	%edx, %eax	/* saved entry %edx; L(exit) adds %eax back onto %esi */
   1334 	jae	L(shr_13_gobble)
   1335 
   1336 	movdqa	16(%esi), %xmm1
   1337 	movdqa	%xmm1, %xmm2	/* keep the raw second block for the next palignr */
   1338 	palignr	$13, (%esi), %xmm1	/* 16 bytes starting 13 bytes into (%esi) */
   1339 	pcmpeqb	(%edi), %xmm1	/* %xmm1 = per-byte equality for bytes 0..15 */
   1340 
   1341 	movdqa	32(%esi), %xmm3
   1342 	palignr	$13, %xmm2, %xmm3
   1343 	pcmpeqb	16(%edi), %xmm3	/* %xmm3 = per-byte equality for bytes 16..31 */
   1344 
   1345 	pand	%xmm1, %xmm3
   1346 	pmovmskb %xmm3, %edx
   1347 	lea	32(%edi), %edi
   1348 	lea	32(%esi), %esi
   1349 	sub	$0xffff, %edx	/* zero only when all 32 bytes matched */
   1350 	jnz	L(exit)
   1351 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1352 	lea	13(%ecx, %esi,1), %edx	/* %edx = %esi + 13 + remaining count */
   1353 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1354 	POP	(%esi)
   1355 	jmp	L(less48bytes)
   1356 
   1357 	cfi_restore_state
   1358 	cfi_remember_state
   1359 	.p2align 4
   1360 L(shr_13_gobble):
        	/* Loop variant of L(shr_13): compares 32 bytes per iteration using
        	   palignr $13 on the %esi side.  %xmm0 holds the equality result
        	   of the first 16 bytes of a chunk, %xmm3 that of the second 16;
        	   the results for the next chunk are computed while the current
        	   one is being tested.  */
   1361 	sub	$32, %ecx
   1362 	movdqa	16(%esi), %xmm0
   1363 	palignr	$13, (%esi), %xmm0
   1364 	pcmpeqb	(%edi), %xmm0
   1365 
   1366 	movdqa	32(%esi), %xmm3
   1367 	palignr	$13, 16(%esi), %xmm3
   1368 	pcmpeqb	16(%edi), %xmm3
   1369 
   1370 L(shr_13_gobble_loop):
   1371 	pand	%xmm0, %xmm3
   1372 	sub	$32, %ecx	/* CF = 1 when %ecx was below 32 (unsigned borrow) */
   1373 	pmovmskb %xmm3, %edx
   1374 	movdqa	%xmm0, %xmm1	/* L(exit) inspects %xmm1 for the first 16 bytes */
   1375 
   1376 	movdqa	64(%esi), %xmm3
   1377 	palignr	$13,48(%esi), %xmm3
   1378 	sbb	$0xffff, %edx	/* mask - 0xffff - CF: zero only if all equal and no borrow */
   1379 	movdqa	48(%esi), %xmm0
   1380 	palignr	$13,32(%esi), %xmm0
   1381 	pcmpeqb	32(%edi), %xmm0
   1382 	lea	32(%esi), %esi
   1383 	pcmpeqb	48(%edi), %xmm3
   1384 
   1385 	lea	32(%edi), %edi
   1386 	jz	L(shr_13_gobble_loop)	/* ZF is still the sbb result: SSE ops and lea leave flags alone */
   1387 	pand	%xmm0, %xmm3
   1388 
   1389 	cmp	$0, %ecx
   1390 	jge	L(shr_13_gobble_next)
   1391 	inc	%edx	/* count went negative: presumably cancels the borrow folded in by sbb */
   1392 	add	$32, %ecx
   1393 L(shr_13_gobble_next):
   1394 	test	%edx, %edx
   1395 	jnz	L(exit)	/* mismatch in the chunk just tested */
   1396 
   1397 	pmovmskb %xmm3, %edx	/* now test the chunk that was computed ahead */
   1398 	movdqa	%xmm0, %xmm1
   1399 	lea	32(%edi), %edi
   1400 	lea	32(%esi), %esi
   1401 	sub	$0xffff, %edx
   1402 	jnz	L(exit)
   1403 
   1404 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1405 	lea	13(%ecx, %esi,1), %edx	/* %edx = %esi + 13 + remaining count */
   1406 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1407 	POP	(%esi)
   1408 	jmp	L(less48bytes)
   1409 #endif
   1410 
   1411 #if !defined(USE_WCHAR)
   1412 	cfi_restore_state
   1413 	cfi_remember_state
   1414 	.p2align 4
   1415 L(shr_14):
        	/* Compare path that realigns the %esi side with palignr $14; only
        	   assembled when USE_WCHAR is not defined.  %ecx is the byte
        	   count: 80 or more (unsigned) goes to the gobble loop, otherwise
        	   one 32-byte chunk is compared here and the rest goes to
        	   L(less48bytes).
        	   NOTE(review): %edx on entry is presumably the source
        	   misalignment (14) set outside this chunk -- confirm.  */
   1416 	cmp	$80, %ecx
   1417 	lea	-48(%ecx), %ecx	/* lea and mov keep the flags from cmp for the jae below */
   1418 	mov	%edx, %eax	/* saved entry %edx; L(exit) adds %eax back onto %esi */
   1419 	jae	L(shr_14_gobble)
   1420 
   1421 	movdqa	16(%esi), %xmm1
   1422 	movdqa	%xmm1, %xmm2	/* keep the raw second block for the next palignr */
   1423 	palignr	$14, (%esi), %xmm1	/* 16 bytes starting 14 bytes into (%esi) */
   1424 	pcmpeqb	(%edi), %xmm1	/* %xmm1 = per-byte equality for bytes 0..15 */
   1425 
   1426 	movdqa	32(%esi), %xmm3
   1427 	palignr	$14, %xmm2, %xmm3
   1428 	pcmpeqb	16(%edi), %xmm3	/* %xmm3 = per-byte equality for bytes 16..31 */
   1429 
   1430 	pand	%xmm1, %xmm3
   1431 	pmovmskb %xmm3, %edx
   1432 	lea	32(%edi), %edi
   1433 	lea	32(%esi), %esi
   1434 	sub	$0xffff, %edx	/* zero only when all 32 bytes matched */
   1435 	jnz	L(exit)
   1436 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1437 	lea	14(%ecx, %esi,1), %edx	/* %edx = %esi + 14 + remaining count */
   1438 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1439 	POP	(%esi)
   1440 	jmp	L(less48bytes)
   1441 
   1442 	cfi_restore_state
   1443 	cfi_remember_state
   1444 	.p2align 4
   1445 L(shr_14_gobble):
        	/* Loop variant of L(shr_14): compares 32 bytes per iteration using
        	   palignr $14 on the %esi side.  %xmm0 holds the equality result
        	   of the first 16 bytes of a chunk, %xmm3 that of the second 16;
        	   the results for the next chunk are computed while the current
        	   one is being tested.  */
   1446 	sub	$32, %ecx
   1447 	movdqa	16(%esi), %xmm0
   1448 	palignr	$14, (%esi), %xmm0
   1449 	pcmpeqb	(%edi), %xmm0
   1450 
   1451 	movdqa	32(%esi), %xmm3
   1452 	palignr	$14, 16(%esi), %xmm3
   1453 	pcmpeqb	16(%edi), %xmm3
   1454 
   1455 L(shr_14_gobble_loop):
   1456 	pand	%xmm0, %xmm3
   1457 	sub	$32, %ecx	/* CF = 1 when %ecx was below 32 (unsigned borrow) */
   1458 	pmovmskb %xmm3, %edx
   1459 	movdqa	%xmm0, %xmm1	/* L(exit) inspects %xmm1 for the first 16 bytes */
   1460 
   1461 	movdqa	64(%esi), %xmm3
   1462 	palignr	$14,48(%esi), %xmm3
   1463 	sbb	$0xffff, %edx	/* mask - 0xffff - CF: zero only if all equal and no borrow */
   1464 	movdqa	48(%esi), %xmm0
   1465 	palignr	$14,32(%esi), %xmm0
   1466 	pcmpeqb	32(%edi), %xmm0
   1467 	lea	32(%esi), %esi
   1468 	pcmpeqb	48(%edi), %xmm3
   1469 
   1470 	lea	32(%edi), %edi
   1471 	jz	L(shr_14_gobble_loop)	/* ZF is still the sbb result: SSE ops and lea leave flags alone */
   1472 	pand	%xmm0, %xmm3
   1473 
   1474 	cmp	$0, %ecx
   1475 	jge	L(shr_14_gobble_next)
   1476 	inc	%edx	/* count went negative: presumably cancels the borrow folded in by sbb */
   1477 	add	$32, %ecx
   1478 L(shr_14_gobble_next):
   1479 	test	%edx, %edx
   1480 	jnz	L(exit)	/* mismatch in the chunk just tested */
   1481 
   1482 	pmovmskb %xmm3, %edx	/* now test the chunk that was computed ahead */
   1483 	movdqa	%xmm0, %xmm1
   1484 	lea	32(%edi), %edi
   1485 	lea	32(%esi), %esi
   1486 	sub	$0xffff, %edx
   1487 	jnz	L(exit)
   1488 
   1489 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1490 	lea	14(%ecx, %esi,1), %edx	/* %edx = %esi + 14 + remaining count */
   1491 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1492 	POP	(%esi)
   1493 	jmp	L(less48bytes)
   1494 #endif
   1495 
   1496 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1497 	cfi_restore_state
   1498 	cfi_remember_state
   1499 	.p2align 4
   1500 L(shr_15):
        	/* Compare path that realigns the %esi side with palignr $15; only
        	   assembled when neither USE_WCHAR nor USE_UTF16 is defined.
        	   %ecx is the byte count: 80 or more (unsigned) goes to the gobble
        	   loop, otherwise one 32-byte chunk is compared here and the rest
        	   goes to L(less48bytes).
        	   NOTE(review): %edx on entry is presumably the source
        	   misalignment (15) set outside this chunk -- confirm.  */
   1501 	cmp	$80, %ecx
   1502 	lea	-48(%ecx), %ecx	/* lea and mov keep the flags from cmp for the jae below */
   1503 	mov	%edx, %eax	/* saved entry %edx; L(exit) adds %eax back onto %esi */
   1504 	jae	L(shr_15_gobble)
   1505 
   1506 	movdqa	16(%esi), %xmm1
   1507 	movdqa	%xmm1, %xmm2	/* keep the raw second block for the next palignr */
   1508 	palignr	$15, (%esi), %xmm1	/* 16 bytes starting 15 bytes into (%esi) */
   1509 	pcmpeqb	(%edi), %xmm1	/* %xmm1 = per-byte equality for bytes 0..15 */
   1510 
   1511 	movdqa	32(%esi), %xmm3
   1512 	palignr	$15, %xmm2, %xmm3
   1513 	pcmpeqb	16(%edi), %xmm3	/* %xmm3 = per-byte equality for bytes 16..31 */
   1514 
   1515 	pand	%xmm1, %xmm3
   1516 	pmovmskb %xmm3, %edx
   1517 	lea	32(%edi), %edi
   1518 	lea	32(%esi), %esi
   1519 	sub	$0xffff, %edx	/* zero only when all 32 bytes matched */
   1520 	jnz	L(exit)
   1521 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1522 	lea	15(%ecx, %esi,1), %edx	/* %edx = %esi + 15 + remaining count */
   1523 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1524 	POP	(%esi)
   1525 	jmp	L(less48bytes)
   1526 
   1527 	cfi_restore_state
   1528 	cfi_remember_state
   1529 	.p2align 4
   1530 L(shr_15_gobble):
        	/* Loop variant of L(shr_15): compares 32 bytes per iteration using
        	   palignr $15 on the %esi side.  %xmm0 holds the equality result
        	   of the first 16 bytes of a chunk, %xmm3 that of the second 16;
        	   the results for the next chunk are computed while the current
        	   one is being tested.  */
   1531 	sub	$32, %ecx
   1532 	movdqa	16(%esi), %xmm0
   1533 	palignr	$15, (%esi), %xmm0
   1534 	pcmpeqb	(%edi), %xmm0
   1535 
   1536 	movdqa	32(%esi), %xmm3
   1537 	palignr	$15, 16(%esi), %xmm3
   1538 	pcmpeqb	16(%edi), %xmm3
   1539 
   1540 L(shr_15_gobble_loop):
   1541 	pand	%xmm0, %xmm3
   1542 	sub	$32, %ecx	/* CF = 1 when %ecx was below 32 (unsigned borrow) */
   1543 	pmovmskb %xmm3, %edx
   1544 	movdqa	%xmm0, %xmm1	/* L(exit) inspects %xmm1 for the first 16 bytes */
   1545 
   1546 	movdqa	64(%esi), %xmm3
   1547 	palignr	$15,48(%esi), %xmm3
   1548 	sbb	$0xffff, %edx	/* mask - 0xffff - CF: zero only if all equal and no borrow */
   1549 	movdqa	48(%esi), %xmm0
   1550 	palignr	$15,32(%esi), %xmm0
   1551 	pcmpeqb	32(%edi), %xmm0
   1552 	lea	32(%esi), %esi
   1553 	pcmpeqb	48(%edi), %xmm3
   1554 
   1555 	lea	32(%edi), %edi
   1556 	jz	L(shr_15_gobble_loop)	/* ZF is still the sbb result: SSE ops and lea leave flags alone */
   1557 	pand	%xmm0, %xmm3
   1558 
   1559 	cmp	$0, %ecx
   1560 	jge	L(shr_15_gobble_next)
   1561 	inc	%edx	/* count went negative: presumably cancels the borrow folded in by sbb */
   1562 	add	$32, %ecx
   1563 L(shr_15_gobble_next):
   1564 	test	%edx, %edx
   1565 	jnz	L(exit)	/* mismatch in the chunk just tested */
   1566 
   1567 	pmovmskb %xmm3, %edx	/* now test the chunk that was computed ahead */
   1568 	movdqa	%xmm0, %xmm1
   1569 	lea	32(%edi), %edi
   1570 	lea	32(%esi), %esi
   1571 	sub	$0xffff, %edx
   1572 	jnz	L(exit)
   1573 
   1574 	lea	(%ecx, %edi,1), %eax	/* %eax = %edi + remaining count */
   1575 	lea	15(%ecx, %esi,1), %edx	/* %edx = %esi + 15 + remaining count */
   1576 	POP	(%edi)	/* POP is a macro defined outside this chunk */
   1577 	POP	(%esi)
   1578 	jmp	L(less48bytes)
   1579 #endif
   1580 
   1581 	cfi_restore_state
   1582 	cfi_remember_state
   1583 	.p2align 4
   1584 L(exit):
        	/* Reached from the shr_N paths when a 32-byte chunk contains a
        	   mismatch.  On entry %edi/%esi already point 32 bytes past the
        	   start of that chunk, %xmm1 holds the pcmpeqb result of its first
        	   16 bytes and %edx the combined (mask - 0xffff) value.  Decides
        	   which 16-byte half holds the mismatch and leaves %edi/%esi 16
        	   bytes past the start of that half for the code at
        	   L(less16bytes).  */
   1585 	pmovmskb %xmm1, %ebx
   1586 	sub	$0xffff, %ebx
   1587 	jz	L(first16bytes)	/* first half all equal: mismatch is in the second half */
   1588 	lea	-16(%esi), %esi	/* mismatch in the first half: step both pointers back */
   1589 	lea	-16(%edi), %edi
   1590 	mov	%ebx, %edx	/* and use the first-half mask */
   1591 
   1592 L(first16bytes):
   1593 	add	%eax, %esi	/* %eax was saved from %edx on entry to shr_N; presumably the source misalignment */
   1594 L(less16bytes):
   1595 
   1596 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1597 	test	%dl, %dl
        	/* Byte-granular mismatch locator (neither USE_WCHAR nor USE_UTF16).
        	   %edx holds (equality mask - 0xffff); its lowest set bit is at the
        	   index of the first differing byte within the 16 bytes that end
        	   at %edi/%esi.  %dl covers bytes 0..7 and %dh bytes 8..15.  Each
        	   ByteNN stub returns the difference of the two zero-extended
        	   bytes through RETURN (a macro defined outside this chunk).  */
   1598 	jz	L(next_24_bytes)
   1599 
   1600 	test	$0x01, %dl
   1601 	jnz	L(Byte16)
   1602 
   1603 	test	$0x02, %dl
   1604 	jnz	L(Byte17)
   1605 
   1606 	test	$0x04, %dl
   1607 	jnz	L(Byte18)
   1608 
   1609 	test	$0x08, %dl
   1610 	jnz	L(Byte19)
   1611 
   1612 	test	$0x10, %dl
   1613 	jnz	L(Byte20)
   1614 
   1615 	test	$0x20, %dl
   1616 	jnz	L(Byte21)
   1617 
   1618 	test	$0x40, %dl
   1619 	jnz	L(Byte22)
   1620 L(Byte23):
        	/* %dl was non-zero and bits 0..6 are clear, so bit 7 is set.  */
   1621 	movzbl	-9(%edi), %eax
   1622 	movzbl	-9(%esi), %edx
   1623 	sub	%edx, %eax
   1624 	RETURN
   1625 
   1626 	.p2align 4
   1627 L(Byte16):
   1628 	movzbl	-16(%edi), %eax
   1629 	movzbl	-16(%esi), %edx
   1630 	sub	%edx, %eax
   1631 	RETURN
   1632 
   1633 	.p2align 4
   1634 L(Byte17):
   1635 	movzbl	-15(%edi), %eax
   1636 	movzbl	-15(%esi), %edx
   1637 	sub	%edx, %eax
   1638 	RETURN
   1639 
   1640 	.p2align 4
   1641 L(Byte18):
   1642 	movzbl	-14(%edi), %eax
   1643 	movzbl	-14(%esi), %edx
   1644 	sub	%edx, %eax
   1645 	RETURN
   1646 
   1647 	.p2align 4
   1648 L(Byte19):
   1649 	movzbl	-13(%edi), %eax
   1650 	movzbl	-13(%esi), %edx
   1651 	sub	%edx, %eax
   1652 	RETURN
   1653 
   1654 	.p2align 4
   1655 L(Byte20):
   1656 	movzbl	-12(%edi), %eax
   1657 	movzbl	-12(%esi), %edx
   1658 	sub	%edx, %eax
   1659 	RETURN
   1660 
   1661 	.p2align 4
   1662 L(Byte21):
   1663 	movzbl	-11(%edi), %eax
   1664 	movzbl	-11(%esi), %edx
   1665 	sub	%edx, %eax
   1666 	RETURN
   1667 
   1668 	.p2align 4
   1669 L(Byte22):
   1670 	movzbl	-10(%edi), %eax
   1671 	movzbl	-10(%esi), %edx
   1672 	sub	%edx, %eax
   1673 	RETURN
   1674 
   1675 	.p2align 4
   1676 L(next_24_bytes):
        	/* Mismatch is in bytes 8..15: advance both pointers by 8 so the
        	   ByteNN stubs above can be reused for the bits of %dh.  */
   1677 	lea	8(%edi), %edi
   1678 	lea	8(%esi), %esi
   1679 	test	$0x01, %dh
   1680 	jnz	L(Byte16)
   1681 
   1682 	test	$0x02, %dh
   1683 	jnz	L(Byte17)
   1684 
   1685 	test	$0x04, %dh
   1686 	jnz	L(Byte18)
   1687 
   1688 	test	$0x08, %dh
   1689 	jnz	L(Byte19)
   1690 
   1691 	test	$0x10, %dh
   1692 	jnz	L(Byte20)
   1693 
   1694 	test	$0x20, %dh
   1695 	jnz	L(Byte21)
   1696 
   1697 	test	$0x40, %dh
   1698 	jnz	L(Byte22)
   1699 
   1700 	.p2align 4
   1701 L(Byte31):
        	/* Reached by fall-through when none of bits 0..6 of %dh is set.  */
   1702 	movzbl	-9(%edi), %eax
   1703 	movzbl	-9(%esi), %edx
   1704 	sub	%edx, %eax
   1705 	RETURN_END
   1706 #elif defined(USE_AS_WMEMCMP)
   1707 
   1708 /* special for wmemcmp */
        	/* Locates the first differing 32-bit element among the four that
        	   end at %edi/%esi, using the mask value in %edx: the low nibble of
        	   %dl is element 0, the rest of %dl element 1, the low nibble of
        	   %dh element 2, otherwise element 3.  The elements are compared as
        	   signed values (jg) and the result is 1 or -1 in %eax.  RETURN and
        	   RETURN_END are macros defined outside this chunk.  */
   1709 	test	%dl, %dl
   1710 	jz	L(next_two_double_words)
   1711 	and	$15, %dl
   1712 	jz	L(second_double_word)
   1713 	mov	-16(%edi), %ecx
   1714 	cmp	-16(%esi), %ecx
   1715 	mov	$1, %eax	/* mov keeps the flags from cmp for jg */
   1716 	jg	L(nequal_bigger)
   1717 	neg	%eax
   1718 	RETURN
   1719 
   1720 	.p2align 4
   1721 L(second_double_word):
   1722 	mov	-12(%edi), %ecx
   1723 	cmp	-12(%esi), %ecx
   1724 	mov	$1, %eax
   1725 	jg	L(nequal_bigger)
   1726 	neg	%eax
   1727 	RETURN
   1728 
   1729 	.p2align 4
   1730 L(next_two_double_words):
   1731 	and	$15, %dh
   1732 	jz	L(fourth_double_word)
   1733 	mov	-8(%edi), %ecx
   1734 	cmp	-8(%esi), %ecx
   1735 	mov	$1, %eax
   1736 	jg	L(nequal_bigger)
   1737 	neg	%eax
   1738 	RETURN
   1739 
   1740 	.p2align 4
   1741 L(fourth_double_word):
   1742 	mov	-4(%edi), %ecx
   1743 	cmp	-4(%esi), %ecx
   1744 	mov	$1, %eax
   1745 	jg	L(nequal_bigger)
   1746 	neg	%eax
   1747 	RETURN
   1748 
   1749 	.p2align 4
   1750 L(nequal_bigger):
        	/* Shared return for the "greater" case; %eax is already 1.  */
   1751 	RETURN_END
   1752 
   1753 #elif defined(USE_AS_MEMCMP16)
   1754 
   1755 /* special for __memcmp16 */
        	/* Locates the first differing 16-bit element among the eight that
        	   end at %edi/%esi, using the mask value in %edx (two mask bits per
        	   element; %dl covers elements 0..3 and %dh elements 4..7).  Each
        	   leaf returns the difference of the two zero-extended 16-bit
        	   values in %eax, using %ebx as scratch.  RETURN is a macro defined
        	   outside this chunk.  */
   1756 	test	%dl, %dl
   1757 	jz	L(next_four_words)
   1758 	test	$15, %dl
   1759 	jz	L(second_two_words)
   1760 	test	$3, %dl
   1761 	jz	L(second_word)
   1762 	movzwl	-16(%edi), %eax
   1763 	movzwl	-16(%esi), %ebx
   1764 	subl	%ebx, %eax
   1765 	RETURN
   1766 
   1767 	.p2align 4
   1768 L(second_word):
   1769 	movzwl	-14(%edi), %eax
   1770 	movzwl	-14(%esi), %ebx
   1771 	subl	%ebx, %eax
   1772 	RETURN
   1773 
   1774 	.p2align 4
   1775 L(second_two_words):
        	/* Low nibble of %dl is clear: element 2 if bits 4..5 are set,
        	   otherwise element 3.  */
   1776 	test	$63, %dl
   1777 	jz	L(fourth_word)
   1778 	movzwl	-12(%edi), %eax
   1779 	movzwl	-12(%esi), %ebx
   1780 	subl	%ebx, %eax
   1781 	RETURN
   1782 
   1783 	.p2align 4
   1784 L(fourth_word):
   1785 	movzwl	-10(%edi), %eax
   1786 	movzwl	-10(%esi), %ebx
   1787 	subl	%ebx, %eax
   1788 	RETURN
   1789 
   1790 	.p2align 4
   1791 L(next_four_words):
        	/* %dl is zero: repeat the same bit tests on %dh for elements 4..7.  */
   1792 	test	$15, %dh
   1793 	jz	L(fourth_two_words)
   1794 	test	$3, %dh
   1795 	jz	L(sixth_word)
   1796 	movzwl	-8(%edi), %eax
   1797 	movzwl	-8(%esi), %ebx
   1798 	subl	%ebx, %eax
   1799 	RETURN
   1800 
   1801 	.p2align 4
   1802 L(sixth_word):
   1803 	movzwl	-6(%edi), %eax
   1804 	movzwl	-6(%esi), %ebx
   1805 	subl	%ebx, %eax
   1806 	RETURN
   1807 
   1808 	.p2align 4
   1809 L(fourth_two_words):
   1810 	test	$63, %dh
   1811 	jz	L(eighth_word)
   1812 	movzwl	-4(%edi), %eax
   1813 	movzwl	-4(%esi), %ebx
   1814 	subl	%ebx, %eax
   1815 	RETURN
   1816 
   1817 	.p2align 4
   1818 L(eighth_word):
   1819 	movzwl	-2(%edi), %eax
   1820 	movzwl	-2(%esi), %ebx
   1821 	subl	%ebx, %eax
   1822 	RETURN
   1823 #else
   1824 # error Unreachable preprocessor case
   1825 #endif
   1826 
   1827 	CFI_PUSH (%ebx)
   1828 
   1829 	.p2align 4
   1830 L(more8bytes):
        	/* Tail dispatch for %ecx >= 8: counts of 16 and above are passed on
        	   to L(more16bytes), the rest jump to the matching L(<n>bytes)
        	   entry.  The byte variant tests every count; the USE_WCHAR
        	   variant sends anything other than 8 to L(12bytes); the
        	   USE_UTF16 variant tests 10 and 12 and sends the rest to
        	   L(14bytes).  */
   1831 	cmp	$16, %ecx
   1832 	jae	L(more16bytes)
   1833 	cmp	$8, %ecx
   1834 	je	L(8bytes)
   1835 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1836 	cmp	$9, %ecx
   1837 	je	L(9bytes)
   1838 	cmp	$10, %ecx
   1839 	je	L(10bytes)
   1840 	cmp	$11, %ecx
   1841 	je	L(11bytes)
   1842 	cmp	$12, %ecx
   1843 	je	L(12bytes)
   1844 	cmp	$13, %ecx
   1845 	je	L(13bytes)
   1846 	cmp	$14, %ecx
   1847 	je	L(14bytes)
   1848 	jmp	L(15bytes)
   1849 #elif defined(USE_WCHAR) && !defined(USE_UTF16)
   1850 	jmp	L(12bytes)
   1851 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   1852 	cmp	$10, %ecx
   1853 	je	L(10bytes)
   1854 	cmp	$12, %ecx
   1855 	je	L(12bytes)
   1856 	jmp	L(14bytes)
   1857 #else
   1858 # error Unreachable preprocessor case
   1859 #endif
   1860 
   1861 	.p2align 4
   1862 L(more16bytes):
        	/* Tail dispatch for %ecx >= 16: counts of 24 and above are passed
        	   on to L(more24bytes), the rest jump to the matching
        	   L(<n>bytes) entry (every count for bytes, only 20 besides 16 for
        	   USE_WCHAR, even counts for USE_UTF16).  */
   1863 	cmp	$24, %ecx
   1864 	jae	L(more24bytes)
   1865 	cmp	$16, %ecx
   1866 	je	L(16bytes)
   1867 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1868 	cmp	$17, %ecx
   1869 	je	L(17bytes)
   1870 	cmp	$18, %ecx
   1871 	je	L(18bytes)
   1872 	cmp	$19, %ecx
   1873 	je	L(19bytes)
   1874 	cmp	$20, %ecx
   1875 	je	L(20bytes)
   1876 	cmp	$21, %ecx
   1877 	je	L(21bytes)
   1878 	cmp	$22, %ecx
   1879 	je	L(22bytes)
   1880 	jmp	L(23bytes)
   1881 #elif defined(USE_WCHAR) && !defined(USE_UTF16)
   1882 	jmp	L(20bytes)
   1883 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   1884 	cmp	$18, %ecx
   1885 	je	L(18bytes)
   1886 	cmp	$20, %ecx
   1887 	je	L(20bytes)
   1888 	jmp	L(22bytes)
   1889 #else
   1890 # error Unreachable preprocessor case
   1891 #endif
   1892 
   1893 	.p2align 4
   1894 L(more24bytes):
        	/* Tail dispatch for %ecx >= 24: counts of 32 and above are passed
        	   on to L(more32bytes), the rest jump to the matching
        	   L(<n>bytes) entry (every count for bytes, only 28 besides 24 for
        	   USE_WCHAR, even counts for USE_UTF16).  */
   1895 	cmp	$32, %ecx
   1896 	jae	L(more32bytes)
   1897 	cmp	$24, %ecx
   1898 	je	L(24bytes)
   1899 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1900 	cmp	$25, %ecx
   1901 	je	L(25bytes)
   1902 	cmp	$26, %ecx
   1903 	je	L(26bytes)
   1904 	cmp	$27, %ecx
   1905 	je	L(27bytes)
   1906 	cmp	$28, %ecx
   1907 	je	L(28bytes)
   1908 	cmp	$29, %ecx
   1909 	je	L(29bytes)
   1910 	cmp	$30, %ecx
   1911 	je	L(30bytes)
   1912 	jmp	L(31bytes)
   1913 #elif defined(USE_WCHAR) && !defined(USE_UTF16)
   1914 	jmp	L(28bytes)
   1915 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   1916 	cmp	$26, %ecx
   1917 	je	L(26bytes)
   1918 	cmp	$28, %ecx
   1919 	je	L(28bytes)
   1920 	jmp	L(30bytes)
   1921 #else
   1922 # error Unreachable preprocessor case
   1923 #endif
   1924 
   1925 	.p2align 4
   1926 L(more32bytes):
        	/* Tail dispatch for %ecx >= 32: counts of 40 and above are passed
        	   on to L(more40bytes), the rest jump to the matching
        	   L(<n>bytes) entry (every count for bytes, only 36 besides 32 for
        	   USE_WCHAR, even counts for USE_UTF16).  */
   1927 	cmp	$40, %ecx
   1928 	jae	L(more40bytes)
   1929 	cmp	$32, %ecx
   1930 	je	L(32bytes)
   1931 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1932 	cmp	$33, %ecx
   1933 	je	L(33bytes)
   1934 	cmp	$34, %ecx
   1935 	je	L(34bytes)
   1936 	cmp	$35, %ecx
   1937 	je	L(35bytes)
   1938 	cmp	$36, %ecx
   1939 	je	L(36bytes)
   1940 	cmp	$37, %ecx
   1941 	je	L(37bytes)
   1942 	cmp	$38, %ecx
   1943 	je	L(38bytes)
   1944 	jmp	L(39bytes)
   1945 #elif defined(USE_WCHAR) && !defined(USE_UTF16)
   1946 	jmp	L(36bytes)
   1947 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   1948 	cmp	$34, %ecx
   1949 	je	L(34bytes)
   1950 	cmp	$36, %ecx
   1951 	je	L(36bytes)
   1952 	jmp	L(38bytes)
   1953 #else
   1954 # error Unreachable preprocessor case
   1955 #endif
   1956 
   1957 	.p2align 4
   1958 L(less48bytes):
        	/* Entry of the tail dispatcher.  %ecx is the number of bytes still
        	   to compare; the L(<n>bytes) chains it jumps to read at negative
        	   offsets from %eax and %edx, so those registers point just past
        	   the end of the two regions.  Counts of 8 and above go to
        	   L(more8bytes); smaller ones are resolved here.
        	   NOTE(review): in the byte variant any count other than 2..6
        	   (including 0 and 1) lands on L(7bytes), and similarly on
        	   L(4bytes)/L(6bytes) in the other variants; presumably the bytes
        	   before the tail are already known equal -- confirm against the
        	   callers.  */
   1959 	cmp	$8, %ecx
   1960 	jae	L(more8bytes)
   1961 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1962 	cmp	$2, %ecx
   1963 	je	L(2bytes)
   1964 	cmp	$3, %ecx
   1965 	je	L(3bytes)
   1966 	cmp	$4, %ecx
   1967 	je	L(4bytes)
   1968 	cmp	$5, %ecx
   1969 	je	L(5bytes)
   1970 	cmp	$6, %ecx
   1971 	je	L(6bytes)
   1972 	jmp	L(7bytes)
   1973 #elif defined(USE_WCHAR) && !defined(USE_UTF16)
   1974 	jmp	L(4bytes)
   1975 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   1976 	cmp	$2, %ecx
   1977 	je	L(2bytes)
   1978 	cmp	$4, %ecx
   1979 	je	L(4bytes)
   1980 	jmp	L(6bytes)
   1981 #else
   1982 # error Unreachable preprocessor case
   1983 #endif
   1984 
   1985 	.p2align 4
   1986 L(more40bytes):
        	/* Tail dispatch for %ecx >= 40.  The byte variant tests 41..46 and
        	   sends the rest to L(47bytes); the USE_UTF16 variant tests 42 and
        	   44 and sends the rest to L(46bytes).  There is no USE_WCHAR
        	   branch: in that configuration a count other than 40 falls
        	   through to whatever code follows this block.  */
   1987 	cmp	$40, %ecx
   1988 	je	L(40bytes)
   1989 #if !defined(USE_WCHAR) && !defined(USE_UTF16)
   1990 	cmp	$41, %ecx
   1991 	je	L(41bytes)
   1992 	cmp	$42, %ecx
   1993 	je	L(42bytes)
   1994 	cmp	$43, %ecx
   1995 	je	L(43bytes)
   1996 	cmp	$44, %ecx
   1997 	je	L(44bytes)
   1998 	cmp	$45, %ecx
   1999 	je	L(45bytes)
   2000 	cmp	$46, %ecx
   2001 	je	L(46bytes)
   2002 	jmp	L(47bytes)
   2003 #elif defined(USE_UTF16) && !defined(USE_WCHAR)
   2004 	cmp	$42, %ecx
   2005 	je	L(42bytes)
   2006 	cmp	$44, %ecx
   2007 	je	L(44bytes)
   2008 	jmp	L(46bytes)
   2009 #endif
   2010 
   2011 #if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
   2012 	.p2align 4
   2013 L(44bytes):
        	/* memcmp tail chain for counts that are multiples of 4.  Each
        	   step loads one dword at the same negative offset from the two
        	   end pointers %eax and %edx into %ecx and %ebx; on inequality
        	   L(find_diff) works out the result from those two registers.
        	   Entering at L(<n>bytes) compares the last n bytes.  */
   2014 	mov	-44(%eax), %ecx
   2015 	mov	-44(%edx), %ebx
   2016 	cmp	%ebx, %ecx
   2017 	jne	L(find_diff)
   2018 L(40bytes):
   2019 	mov	-40(%eax), %ecx
   2020 	mov	-40(%edx), %ebx
   2021 	cmp	%ebx, %ecx
   2022 	jne	L(find_diff)
   2023 L(36bytes):
   2024 	mov	-36(%eax), %ecx
   2025 	mov	-36(%edx), %ebx
   2026 	cmp	%ebx, %ecx
   2027 	jne	L(find_diff)
   2028 L(32bytes):
   2029 	mov	-32(%eax), %ecx
   2030 	mov	-32(%edx), %ebx
   2031 	cmp	%ebx, %ecx
   2032 	jne	L(find_diff)
   2033 L(28bytes):
   2034 	mov	-28(%eax), %ecx
   2035 	mov	-28(%edx), %ebx
   2036 	cmp	%ebx, %ecx
   2037 	jne	L(find_diff)
   2038 L(24bytes):
   2039 	mov	-24(%eax), %ecx
   2040 	mov	-24(%edx), %ebx
   2041 	cmp	%ebx, %ecx
   2042 	jne	L(find_diff)
   2043 L(20bytes):
   2044 	mov	-20(%eax), %ecx
   2045 	mov	-20(%edx), %ebx
   2046 	cmp	%ebx, %ecx
   2047 	jne	L(find_diff)
   2048 L(16bytes):
   2049 	mov	-16(%eax), %ecx
   2050 	mov	-16(%edx), %ebx
   2051 	cmp	%ebx, %ecx
   2052 	jne	L(find_diff)
   2053 L(12bytes):
   2054 	mov	-12(%eax), %ecx
   2055 	mov	-12(%edx), %ebx
   2056 	cmp	%ebx, %ecx
   2057 	jne	L(find_diff)
   2058 L(8bytes):
   2059 	mov	-8(%eax), %ecx
   2060 	mov	-8(%edx), %ebx
   2061 	cmp	%ebx, %ecx
   2062 	jne	L(find_diff)
   2063 L(4bytes):
   2064 	mov	-4(%eax), %ecx
   2065 	mov	-4(%edx), %ebx
   2066 	cmp	%ebx, %ecx
   2067 	mov	$0, %eax	/* return value 0 for the equal case; mov (unlike xor) keeps the cmp flags */
   2068 	jne	L(find_diff)
   2069 	POP	(%ebx)
   2070 	ret
   2071 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2072 #elif defined(USE_AS_WMEMCMP)
   2073 
   2074 	.p2align 4
   2075 L(44bytes):
        	/* wmemcmp tail chain.  Each step compares one 32-bit element at the
        	   same negative offset from the end pointers %eax and %edx; on
        	   inequality L(find_diff) turns the cmp flags into the result.
        	   Entering at L(<n>bytes) compares the last n bytes.  */
   2076 	mov	-44(%eax), %ecx
   2077 	cmp	-44(%edx), %ecx
   2078 	jne	L(find_diff)
   2079 L(40bytes):
   2080 	mov	-40(%eax), %ecx
   2081 	cmp	-40(%edx), %ecx
   2082 	jne	L(find_diff)
   2083 L(36bytes):
   2084 	mov	-36(%eax), %ecx
   2085 	cmp	-36(%edx), %ecx
   2086 	jne	L(find_diff)
   2087 L(32bytes):
   2088 	mov	-32(%eax), %ecx
   2089 	cmp	-32(%edx), %ecx
   2090 	jne	L(find_diff)
   2091 L(28bytes):
   2092 	mov	-28(%eax), %ecx
   2093 	cmp	-28(%edx), %ecx
   2094 	jne	L(find_diff)
   2095 L(24bytes):
   2096 	mov	-24(%eax), %ecx
   2097 	cmp	-24(%edx), %ecx
   2098 	jne	L(find_diff)
   2099 L(20bytes):
   2100 	mov	-20(%eax), %ecx
   2101 	cmp	-20(%edx), %ecx
   2102 	jne	L(find_diff)
   2103 L(16bytes):
   2104 	mov	-16(%eax), %ecx
   2105 	cmp	-16(%edx), %ecx
   2106 	jne	L(find_diff)
   2107 L(12bytes):
   2108 	mov	-12(%eax), %ecx
   2109 	cmp	-12(%edx), %ecx
   2110 	jne	L(find_diff)
   2111 L(8bytes):
   2112 	mov	-8(%eax), %ecx
   2113 	cmp	-8(%edx), %ecx
   2114 	jne	L(find_diff)
   2115 L(4bytes):
   2116 	mov	-4(%eax), %ecx
   2117 	xor	%eax, %eax	/* return value 0; done before the cmp so jne sees the cmp flags */
   2118 	cmp	-4(%edx), %ecx
   2119 	jne	L(find_diff)
   2120 	POP	(%ebx)
   2121 	ret
   2122 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2123 #elif defined USE_AS_MEMCMP16
   2124 
   2125 	.p2align 4
   2126 L(46bytes):
        	/* __memcmp16 tail chain.  Each step zero-extends one 16-bit
        	   element from each side (same negative offset from the end
        	   pointers %eax and %edx) and subtracts; a non-zero difference in
        	   %ecx is returned via L(memcmp16_exit).  Entering at L(<n>bytes)
        	   compares the last n bytes.  */
   2127 	movzwl	-46(%eax), %ecx
   2128 	movzwl	-46(%edx), %ebx
   2129 	subl	%ebx, %ecx
   2130 	jne	L(memcmp16_exit)
   2131 L(44bytes):
   2132 	movzwl	-44(%eax), %ecx
   2133 	movzwl	-44(%edx), %ebx
   2134 	subl	%ebx, %ecx
   2135 	jne	L(memcmp16_exit)
   2136 L(42bytes):
   2137 	movzwl	-42(%eax), %ecx
   2138 	movzwl	-42(%edx), %ebx
   2139 	subl	%ebx, %ecx
   2140 	jne	L(memcmp16_exit)
   2141 L(40bytes):
   2142 	movzwl	-40(%eax), %ecx
   2143 	movzwl	-40(%edx), %ebx
   2144 	subl	%ebx, %ecx
   2145 	jne	L(memcmp16_exit)
   2146 L(38bytes):
   2147 	movzwl	-38(%eax), %ecx
   2148 	movzwl	-38(%edx), %ebx
   2149 	subl	%ebx, %ecx
   2150 	jne	L(memcmp16_exit)
   2151 L(36bytes):
   2152 	movzwl	-36(%eax), %ecx
   2153 	movzwl	-36(%edx), %ebx
   2154 	subl	%ebx, %ecx
   2155 	jne	L(memcmp16_exit)
   2156 L(34bytes):
   2157 	movzwl	-34(%eax), %ecx
   2158 	movzwl	-34(%edx), %ebx
   2159 	subl	%ebx, %ecx
   2160 	jne	L(memcmp16_exit)
   2161 L(32bytes):
   2162 	movzwl	-32(%eax), %ecx
   2163 	movzwl	-32(%edx), %ebx
   2164 	subl	%ebx, %ecx
   2165 	jne	L(memcmp16_exit)
   2166 L(30bytes):
   2167 	movzwl	-30(%eax), %ecx
   2168 	movzwl	-30(%edx), %ebx
   2169 	subl	%ebx, %ecx
   2170 	jne	L(memcmp16_exit)
   2171 L(28bytes):
   2172 	movzwl	-28(%eax), %ecx
   2173 	movzwl	-28(%edx), %ebx
   2174 	subl	%ebx, %ecx
   2175 	jne	L(memcmp16_exit)
   2176 L(26bytes):
   2177 	movzwl	-26(%eax), %ecx
   2178 	movzwl	-26(%edx), %ebx
   2179 	subl	%ebx, %ecx
   2180 	jne	L(memcmp16_exit)
   2181 L(24bytes):
   2182 	movzwl	-24(%eax), %ecx
   2183 	movzwl	-24(%edx), %ebx
   2184 	subl	%ebx, %ecx
   2185 	jne	L(memcmp16_exit)
   2186 L(22bytes):
   2187 	movzwl	-22(%eax), %ecx
   2188 	movzwl	-22(%edx), %ebx
   2189 	subl	%ebx, %ecx
   2190 	jne	L(memcmp16_exit)
   2191 L(20bytes):
   2192 	movzwl	-20(%eax), %ecx
   2193 	movzwl	-20(%edx), %ebx
   2194 	subl	%ebx, %ecx
   2195 	jne	L(memcmp16_exit)
   2196 L(18bytes):
   2197 	movzwl	-18(%eax), %ecx
   2198 	movzwl	-18(%edx), %ebx
   2199 	subl	%ebx, %ecx
   2200 	jne	L(memcmp16_exit)
   2201 L(16bytes):
   2202 	movzwl	-16(%eax), %ecx
   2203 	movzwl	-16(%edx), %ebx
   2204 	subl	%ebx, %ecx
   2205 	jne	L(memcmp16_exit)
   2206 L(14bytes):
   2207 	movzwl	-14(%eax), %ecx
   2208 	movzwl	-14(%edx), %ebx
   2209 	subl	%ebx, %ecx
   2210 	jne	L(memcmp16_exit)
   2211 L(12bytes):
   2212 	movzwl	-12(%eax), %ecx
   2213 	movzwl	-12(%edx), %ebx
   2214 	subl	%ebx, %ecx
   2215 	jne	L(memcmp16_exit)
   2216 L(10bytes):
   2217 	movzwl	-10(%eax), %ecx
   2218 	movzwl	-10(%edx), %ebx
   2219 	subl	%ebx, %ecx
   2220 	jne	L(memcmp16_exit)
   2221 L(8bytes):
   2222 	movzwl	-8(%eax), %ecx
   2223 	movzwl	-8(%edx), %ebx
   2224 	subl	%ebx, %ecx
   2225 	jne	L(memcmp16_exit)
   2226 L(6bytes):
   2227 	movzwl	-6(%eax), %ecx
   2228 	movzwl	-6(%edx), %ebx
   2229 	subl	%ebx, %ecx
   2230 	jne	L(memcmp16_exit)
   2231 L(4bytes):
   2232 	movzwl	-4(%eax), %ecx
   2233 	movzwl	-4(%edx), %ebx
   2234 	subl	%ebx, %ecx
   2235 	jne	L(memcmp16_exit)
   2236 L(2bytes):
        	/* Last element: its difference (zero when equal) is the return
        	   value, so it is computed directly in %eax.  */
   2237 	movzwl	-2(%eax), %eax
   2238 	movzwl	-2(%edx), %ebx
   2239 	subl	%ebx, %eax
   2240 	POP	(%ebx)
   2241 	ret
   2242 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2243 #else
   2244 # error Unreachable preprocessor case
   2245 #endif
   2246 
   2247 #if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
   2248 
   2249 	.p2align 4
   2250 L(45bytes):
        	/* memcmp tail chain for counts of the form 4k+1: dword steps at
        	   negative offsets from the end pointers %eax/%edx, then one
        	   final byte.  Dword mismatches go to L(find_diff) with the two
        	   dwords in %ecx/%ebx.  */
   2251 	mov	-45(%eax), %ecx
   2252 	mov	-45(%edx), %ebx
   2253 	cmp	%ebx, %ecx
   2254 	jne	L(find_diff)
   2255 L(41bytes):
   2256 	mov	-41(%eax), %ecx
   2257 	mov	-41(%edx), %ebx
   2258 	cmp	%ebx, %ecx
   2259 	jne	L(find_diff)
   2260 L(37bytes):
   2261 	mov	-37(%eax), %ecx
   2262 	mov	-37(%edx), %ebx
   2263 	cmp	%ebx, %ecx
   2264 	jne	L(find_diff)
   2265 L(33bytes):
   2266 	mov	-33(%eax), %ecx
   2267 	mov	-33(%edx), %ebx
   2268 	cmp	%ebx, %ecx
   2269 	jne	L(find_diff)
   2270 L(29bytes):
   2271 	mov	-29(%eax), %ecx
   2272 	mov	-29(%edx), %ebx
   2273 	cmp	%ebx, %ecx
   2274 	jne	L(find_diff)
   2275 L(25bytes):
   2276 	mov	-25(%eax), %ecx
   2277 	mov	-25(%edx), %ebx
   2278 	cmp	%ebx, %ecx
   2279 	jne	L(find_diff)
   2280 L(21bytes):
   2281 	mov	-21(%eax), %ecx
   2282 	mov	-21(%edx), %ebx
   2283 	cmp	%ebx, %ecx
   2284 	jne	L(find_diff)
   2285 L(17bytes):
   2286 	mov	-17(%eax), %ecx
   2287 	mov	-17(%edx), %ebx
   2288 	cmp	%ebx, %ecx
   2289 	jne	L(find_diff)
   2290 L(13bytes):
   2291 	mov	-13(%eax), %ecx
   2292 	mov	-13(%edx), %ebx
   2293 	cmp	%ebx, %ecx
   2294 	jne	L(find_diff)
   2295 L(9bytes):
   2296 	mov	-9(%eax), %ecx
   2297 	mov	-9(%edx), %ebx
   2298 	cmp	%ebx, %ecx
   2299 	jne	L(find_diff)
   2300 L(5bytes):
   2301 	mov	-5(%eax), %ecx
   2302 	mov	-5(%edx), %ebx
   2303 	cmp	%ebx, %ecx
   2304 	jne	L(find_diff)
   2305 	movzbl	-1(%eax), %ecx	/* final single byte */
   2306 	cmp	-1(%edx), %cl
   2307 	mov	$0, %eax	/* return value 0 for the equal case; mov keeps the cmp flags */
   2308 	jne	L(end)	/* L(end) derives 1 or -1 from those flags */
   2309 	POP	(%ebx)
   2310 	ret
   2311 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2312 
   2313 	.p2align 4
   2314 L(46bytes):
        	/* memcmp tail chain for counts of the form 4k+2: dword steps at
        	   negative offsets from the end pointers %eax/%edx, then the
        	   final two bytes compared one at a time.  Dword mismatches go to
        	   L(find_diff) with the two dwords in %ecx/%ebx.  */
   2315 	mov	-46(%eax), %ecx
   2316 	mov	-46(%edx), %ebx
   2317 	cmp	%ebx, %ecx
   2318 	jne	L(find_diff)
   2319 L(42bytes):
   2320 	mov	-42(%eax), %ecx
   2321 	mov	-42(%edx), %ebx
   2322 	cmp	%ebx, %ecx
   2323 	jne	L(find_diff)
   2324 L(38bytes):
   2325 	mov	-38(%eax), %ecx
   2326 	mov	-38(%edx), %ebx
   2327 	cmp	%ebx, %ecx
   2328 	jne	L(find_diff)
   2329 L(34bytes):
   2330 	mov	-34(%eax), %ecx
   2331 	mov	-34(%edx), %ebx
   2332 	cmp	%ebx, %ecx
   2333 	jne	L(find_diff)
   2334 L(30bytes):
   2335 	mov	-30(%eax), %ecx
   2336 	mov	-30(%edx), %ebx
   2337 	cmp	%ebx, %ecx
   2338 	jne	L(find_diff)
   2339 L(26bytes):
   2340 	mov	-26(%eax), %ecx
   2341 	mov	-26(%edx), %ebx
   2342 	cmp	%ebx, %ecx
   2343 	jne	L(find_diff)
   2344 L(22bytes):
   2345 	mov	-22(%eax), %ecx
   2346 	mov	-22(%edx), %ebx
   2347 	cmp	%ebx, %ecx
   2348 	jne	L(find_diff)
   2349 L(18bytes):
   2350 	mov	-18(%eax), %ecx
   2351 	mov	-18(%edx), %ebx
   2352 	cmp	%ebx, %ecx
   2353 	jne	L(find_diff)
   2354 L(14bytes):
   2355 	mov	-14(%eax), %ecx
   2356 	mov	-14(%edx), %ebx
   2357 	cmp	%ebx, %ecx
   2358 	jne	L(find_diff)
   2359 L(10bytes):
   2360 	mov	-10(%eax), %ecx
   2361 	mov	-10(%edx), %ebx
   2362 	cmp	%ebx, %ecx
   2363 	jne	L(find_diff)
   2364 L(6bytes):
   2365 	mov	-6(%eax), %ecx
   2366 	mov	-6(%edx), %ebx
   2367 	cmp	%ebx, %ecx
   2368 	jne	L(find_diff)
   2369 L(2bytes):
   2370 	movzwl	-2(%eax), %ecx
   2371 	movzwl	-2(%edx), %ebx
   2372 	cmp	%bl, %cl	/* lower-addressed byte first */
   2373 	jne	L(end)
   2374 	cmp	%bh, %ch	/* then the last byte */
   2375 	mov	$0, %eax	/* return value 0 for the equal case; mov keeps the cmp flags */
   2376 	jne	L(end)
   2377 	POP	(%ebx)
   2378 	ret
   2379 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2380 
   2381 	.p2align 4
   2382 L(47bytes):
        	/* memcmp tail chain for counts of the form 4k+3: dword steps at
        	   negative offsets from the end pointers %eax/%edx, then the
        	   final three bytes.  Dword mismatches go to L(find_diff) with
        	   the two dwords in %ecx/%ebx.  */
   2383 	movl	-47(%eax), %ecx
   2384 	movl	-47(%edx), %ebx
   2385 	cmp	%ebx, %ecx
   2386 	jne	L(find_diff)
   2387 L(43bytes):
   2388 	movl	-43(%eax), %ecx
   2389 	movl	-43(%edx), %ebx
   2390 	cmp	%ebx, %ecx
   2391 	jne	L(find_diff)
   2392 L(39bytes):
   2393 	movl	-39(%eax), %ecx
   2394 	movl	-39(%edx), %ebx
   2395 	cmp	%ebx, %ecx
   2396 	jne	L(find_diff)
   2397 L(35bytes):
   2398 	movl	-35(%eax), %ecx
   2399 	movl	-35(%edx), %ebx
   2400 	cmp	%ebx, %ecx
   2401 	jne	L(find_diff)
   2402 L(31bytes):
   2403 	movl	-31(%eax), %ecx
   2404 	movl	-31(%edx), %ebx
   2405 	cmp	%ebx, %ecx
   2406 	jne	L(find_diff)
   2407 L(27bytes):
   2408 	movl	-27(%eax), %ecx
   2409 	movl	-27(%edx), %ebx
   2410 	cmp	%ebx, %ecx
   2411 	jne	L(find_diff)
   2412 L(23bytes):
   2413 	movl	-23(%eax), %ecx
   2414 	movl	-23(%edx), %ebx
   2415 	cmp	%ebx, %ecx
   2416 	jne	L(find_diff)
   2417 L(19bytes):
   2418 	movl	-19(%eax), %ecx
   2419 	movl	-19(%edx), %ebx
   2420 	cmp	%ebx, %ecx
   2421 	jne	L(find_diff)
   2422 L(15bytes):
   2423 	movl	-15(%eax), %ecx
   2424 	movl	-15(%edx), %ebx
   2425 	cmp	%ebx, %ecx
   2426 	jne	L(find_diff)
   2427 L(11bytes):
   2428 	movl	-11(%eax), %ecx
   2429 	movl	-11(%edx), %ebx
   2430 	cmp	%ebx, %ecx
   2431 	jne	L(find_diff)
   2432 L(7bytes):
   2433 	movl	-7(%eax), %ecx
   2434 	movl	-7(%edx), %ebx
   2435 	cmp	%ebx, %ecx
   2436 	jne	L(find_diff)
   2437 L(3bytes):
   2438 	movzwl	-3(%eax), %ecx
   2439 	movzwl	-3(%edx), %ebx
   2440 	cmpb	%bl, %cl	/* byte at offset -3 */
   2441 	jne	L(end)
   2442 	cmp	%bx, %cx	/* low bytes equal, so this is decided by the byte at offset -2 */
   2443 	jne	L(end)
   2444 	movzbl	-1(%eax), %eax
   2445 	cmpb	-1(%edx), %al	/* last byte */
   2446 	mov	$0, %eax	/* return value 0 for the equal case; mov keeps the cmp flags */
   2447 	jne	L(end)
   2448 	POP	(%ebx)
   2449 	ret
   2450 	CFI_PUSH (%ebx)	/* unwind annotation for the code that follows the ret */
   2451 
   2452 	.p2align 4
   2453 L(find_diff):
        	/* %ecx and %ebx hold two unequal dwords loaded from the two
        	   buffers.  Compare them one byte at a time from the lowest
        	   address upwards and leave the flags of the first unequal
        	   comparison for L(end).  */
   2454 	cmpb	%bl, %cl
   2455 	jne	L(end)
   2456 	cmp	%bx, %cx	/* byte 0 equal, so this is decided by byte 1 */
   2457 	jne	L(end)
   2458 	shr	$16,%ecx
   2459 	shr	$16,%ebx
   2460 	cmp	%bl, %cl
   2461 	jne	L(end)
   2462 	cmp	%bx, %cx	/* last byte; falls through into L(end) with these flags */
   2463 
   2464 	.p2align 4
   2465 L(end):
        	/* Turn the flags of the deciding unsigned comparison into the
        	   return value: 1 if the byte from the first buffer is above the
        	   one from the second, otherwise -1.  The ja relies on POP and mov
        	   leaving the flags untouched (POP is a macro defined outside
        	   this chunk).  */
   2466 	POP	(%ebx)
   2467 	mov	$1, %eax
   2468 	ja	L(bigger)
   2469 	neg	%eax
   2470 L(bigger):
   2471 	ret
   2472 #elif defined(USE_AS_WMEMCMP)
   2473 
   2474 	.p2align 4
   2475 L(find_diff):
        	/* wmemcmp result: entered with the flags of a cmp between two
        	   unequal 32-bit elements.  Returns 1 when the first is greater
        	   as a signed value (jg), otherwise -1.  The jg relies on POP and
        	   mov leaving the flags untouched (POP is a macro defined outside
        	   this chunk).  */
   2476 	POP	(%ebx)
   2477 	mov	$1, %eax
   2478 	jg	L(find_diff_bigger)
   2479 	neg	%eax
   2480 	ret
   2481 
   2482 	.p2align 4
   2483 L(find_diff_bigger):
   2484 	ret
   2485 
   2486 #elif defined(USE_AS_MEMCMP16)
   2487 
   2488 	.p2align 4
   2489 L(memcmp16_exit):
        	/* __memcmp16 result: %ecx holds the non-zero difference of the
        	   first unequal pair of 16-bit elements; return it in %eax.  */
   2490 	POP	(%ebx)
   2491 	mov	%ecx, %eax
   2492 	ret
   2493 #else
   2494 # error Unreachable preprocessor case
   2495 #endif
   2496 END (MEMCMP)
   2497