Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2014, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #ifdef USE_AS_STRNCMP
     32 /* UPDATE_STRNCMP_COUNTER: charge the first, partial 16-byte block
     33    against the strncmp byte counter kept in %r11.
           At every use site %rcx holds the rsi offset inside its aligned
           16-byte block, so 16 - %rcx bytes were just compared and the new
           counter is %r11 - (16 - %rcx), computed into %r9.
           Since the counter is unsigned, we branch to strcmp_exitz if the
           new counter > the old one (the subtraction wrapped) or is 0.
           Clobbers %r9 and the flags; updates %r11.
           NOTE(review): no default for STRCMP is provided in this branch;
           presumably the including file defines it - confirm.  */
     34 #define UPDATE_STRNCMP_COUNTER				\
     35 	/* r9 = r11 - (16 - rcx): count left to compare */	\
     36 	lea	-16(%rcx, %r11), %r9;			\
     37 	cmp	%r9, %r11;				\
     38 	jb	L(strcmp_exitz);			\
     39 	test	%r9, %r9;				\
     40 	je	L(strcmp_exitz);			\
     41 	mov	%r9, %r11
     42 
     43 #else	/* plain strcmp: there is no byte counter, the macro expands to nothing */
     44 #define UPDATE_STRNCMP_COUNTER
     45 #ifndef STRCMP	/* symbol name of the generated function, overridable by the includer */
     46 #define STRCMP		strcmp
     47 #endif
     48 #endif
     49 
     50 #ifndef L
        /* L(label) pastes the .L prefix onto a label name; in GAS the .L
           prefix marks a label as local (non-exported).  */
     51 # define L(label)	.L##label
     52 #endif
     53 
     54 #ifndef cfi_startproc
        /* Fallback spelling of the CFI open directive when the includer has
           not provided one.  */
     55 # define cfi_startproc			.cfi_startproc
     56 #endif
     57 
     58 #ifndef cfi_endproc
        /* Fallback spelling of the CFI close directive.  */
     59 # define cfi_endproc			.cfi_endproc
     60 #endif
     61 
     62 #ifndef ENTRY
        /* ENTRY(name): declare `name` as a global symbol of type function,
           align it with .p2align 4 (16 bytes), emit the label and open the
           CFI region.  */
     63 # define ENTRY(name)			\
     64 	.type name,  @function; 	\
     65 	.globl name;			\
     66 	.p2align 4;			\
     67 name:					\
     68 	cfi_startproc
     69 #endif
     70 
     71 #ifndef END
        /* END(name): close the CFI region opened by ENTRY and record the
           symbol size as the distance from `name` to this point.  */
     72 # define END(name)			\
     73 	cfi_endproc;			\
     74 	.size name, .-name
     75 #endif
     76 #define RETURN ret	/* RETURN is a plain `ret` */
     77 	.section .text.ssse3,"ax",@progbits
     78 ENTRY (STRCMP)
     79 /*
     80  * This implementation uses SSE to compare up to 16 bytes at a time.
         *
         * Inputs as used below: %rdi and %rsi are the two string pointers;
         * when built with USE_AS_STRNCMP, %rdx is the maximum byte count and
         * is copied to %r11, which serves as the remaining-byte counter.
         *
         * Entry path: if a 16-byte load from either pointer would stay inside
         * its 64-byte cache line, the first 16 bytes are compared right here
         * with unaligned (two 8-byte) loads; otherwise control goes to
         * crosscache, which works on 16-byte aligned blocks.  When the first
         * 16 bytes are equal and contain no null, execution also falls
         * through into crosscache with both pointers advanced by 16.
     81  */
     82 #ifdef USE_AS_STRNCMP
     83 	test	%rdx, %rdx		/* n == 0: nothing to compare */
     84 	je	L(strcmp_exitz)
     85 	cmp	$1, %rdx		/* n == 1 is dispatched to Byte0 (defined outside this section) */
     86 	je	L(Byte0)
     87 	mov	%rdx, %r11		/* r11 = bytes still to compare */
     88 #endif
     89 	mov	%esi, %ecx
     90 	mov	%edi, %eax
     91 /* Use 64bit AND here to avoid long NOP padding.  */
     92 	and	$0x3f, %rcx		/* rsi alignment in cache line */
     93 	and	$0x3f, %rax		/* rdi alignment in cache line */
     94 	cmp	$0x30, %ecx
     95 	ja	L(crosscache)	/* rsi: 16-byte load will cross cache line */
     96 	cmp	$0x30, %eax
     97 	ja	L(crosscache)	/* rdi: 16-byte load will cross cache line */
     98 	movlpd	(%rdi), %xmm1		/* xmm1 = 16 bytes at rdi (low half, then high half) */
     99 	movlpd	(%rsi), %xmm2		/* xmm2 = 16 bytes at rsi */
    100 	movhpd	8(%rdi), %xmm1
    101 	movhpd	8(%rsi), %xmm2
    102 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
    103 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
    104 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
    105 	psubb	%xmm0, %xmm1		/* byte stays 0xff only where equal AND not null */
    106 	pmovmskb %xmm1, %edx		/* one mask bit per byte */
    107 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
    108 	jnz	L(less16bytes)	/* If not, find different value or null char */
    109 #ifdef USE_AS_STRNCMP
    110 	sub	$16, %r11
    111 	jbe	L(strcmp_exitz)	/* finish comparison: the count is used up */
    112 #endif
    113 	add	$16, %rsi		/* prepare to search next 16 bytes */
    114 	add	$16, %rdi		/* prepare to search next 16 bytes */
    115 
    116 	/*
    117 	 * Determine source and destination string offsets from 16-byte alignment.
    118 	 * Use relative offset difference between the two to determine which case
    119 	 * below to use.
        	 *
        	 * Both pointers are rounded down to 16 bytes; %ecx / %eax keep the
        	 * offsets of rsi / rdi inside their blocks.  Equal offsets go to
        	 * ashr_0.  Otherwise the operands are swapped if needed so that
        	 * %rcx holds the larger offset, %r8d is set to 0xffff to record the
        	 * swap, and the case is selected through unaligned_table with index
        	 * 15 + %rax - %rcx.  The table itself is defined outside this
        	 * section; its entries are read as 32-bit offsets relative to the
        	 * table base.
    120 	 */
    121 	.p2align 4
    122 L(crosscache):
    123 	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
    124 	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
    125 	mov	$0xffff, %edx			/* for equivalent offset */
    126 	xor	%r8d, %r8d			/* r8d = 0: operands not swapped */
    127 	and	$0xf, %ecx			/* offset of rsi */
    128 	and	$0xf, %eax			/* offset of rdi */
    129 	cmp	%eax, %ecx
    130 	je	L(ashr_0)			/* rsi and rdi relative offset same */
    131 	ja	L(bigger)			/* rsi offset already the larger one */
    132 	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
    133 	xchg	%ecx, %eax			/* swap offsets ... */
    134 	xchg	%rsi, %rdi			/* ... and pointers so that rcx > rax */
    135 L(bigger):
    136 	lea	15(%rax), %r9
    137 	sub	%rcx, %r9			/* r9 = 15 + rax - rcx, the table index */
    138 	lea	L(unaligned_table)(%rip), %r10
    139 	movslq	(%r10, %r9,4), %r9		/* sign-extended 32-bit table entry */
    140 	lea	(%r10, %r9), %r10		/* entry is relative to the table base */
    141 	jmp	*%r10				/* jump to corresponding case */
    142 
    143 /*
    144  * The following cases will be handled by ashr_0
    145  *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
    146  *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
         *
         * Both strings start at the same offset inside their aligned 16-byte
         * blocks, so aligned blocks can be compared directly.  The first block
         * is compared with the mask bits of the bytes in front of the strings
         * shifted out; the loop then handles two aligned blocks per iteration.
    147  */
    148 	.p2align 4
    149 L(ashr_0):
    150 
    151 	movdqa	(%rsi), %xmm1
    152 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
    153 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
    154 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
    155 	psubb	%xmm0, %xmm1			/* byte stays 0xff only where equal AND not null */
    156 	pmovmskb %xmm1, %r9d
    157 	shr	%cl, %edx			/* adjust 0xffff for offset */
    158 	shr	%cl, %r9d			/* adjust for 16-byte offset */
    159 	sub	%r9d, %edx
    160 	/*
    161 	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
    162 	 * the start from (16-rax) and no null char was seen.
    163 	 */
    164 	jne	L(less32bytes)		/* mismatch or null char */
    165 	UPDATE_STRNCMP_COUNTER
    166 	mov	$16, %rcx			/* rcx = index of the next block */
    167 	mov	$16, %r9
    168 	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
    169 
    170 	/*
    171 	 * Now both strings are aligned at 16-byte boundary. Loop over strings
    172 	 * checking 32-bytes per iteration.
        	 * %xmm0 is all-zero at the top of every step: the loop only
        	 * continues when no null byte was found.
    173 	 */
    174 	.p2align 4
    175 L(loop_ashr_0):
    176 	movdqa	(%rsi, %rcx), %xmm1
    177 	movdqa	(%rdi, %rcx), %xmm2
    178 
    179 	pcmpeqb	%xmm1, %xmm0			/* null mask of the rsi block */
    180 	pcmpeqb	%xmm2, %xmm1			/* equality mask */
    181 	psubb	%xmm0, %xmm1
    182 	pmovmskb %xmm1, %edx
    183 	sub	$0xffff, %edx			/* zero iff all 16 bytes equal and non-null */
    184 	jnz	L(exit)		/* mismatch or null char seen */
    185 
    186 #ifdef USE_AS_STRNCMP
    187 	sub	$16, %r11
    188 	jbe	L(strcmp_exitz)			/* count used up */
    189 #endif
    190 	add	$16, %rcx
    191 	movdqa	(%rsi, %rcx), %xmm1		/* second block of this iteration */
    192 	movdqa	(%rdi, %rcx), %xmm2
    193 
    194 	pcmpeqb	%xmm1, %xmm0
    195 	pcmpeqb	%xmm2, %xmm1
    196 	psubb	%xmm0, %xmm1
    197 	pmovmskb %xmm1, %edx
    198 	sub	$0xffff, %edx
    199 	jnz	L(exit)
    200 #ifdef USE_AS_STRNCMP
    201 	sub	$16, %r11
    202 	jbe	L(strcmp_exitz)
    203 #endif
    204 	add	$16, %rcx
    205 	jmp	L(loop_ashr_0)
    206 
    207 /*
    208  * The following cases will be handled by ashr_1
    209  * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
    210  *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
         *
         * The rsi offset exceeds the rdi offset by 15.  Each aligned rsi block
         * is compared against the last 15 bytes of the previous rdi block
         * joined with the first byte of the current one (palignr $1); %xmm3
         * carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_1
         * checks whether the comparison can finish inside the 15 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 15), so the next page is only read when a byte from it is
         * really needed.
    211  */
    212 	.p2align 4
    213 L(ashr_1):
    214 	pxor	%xmm0, %xmm0
    215 	movdqa	(%rdi), %xmm2
    216 	movdqa	(%rsi), %xmm1
    217 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
    218 	pslldq	$15, %xmm2		/* shift first string to align with second */
    219 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    220 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    221 	pmovmskb %xmm2, %r9d
    222 	shr	%cl, %edx		/* adjust 0xffff for offset */
    223 	shr	%cl, %r9d		/* adjust for 16-byte offset */
    224 	sub	%r9d, %edx
    225 	jnz	L(less32bytes)	/* mismatch or null char seen */
    226 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    227 	UPDATE_STRNCMP_COUNTER
    228 
    229 	pxor	%xmm0, %xmm0
    230 	mov	$16, %rcx		/* index for loads*/
    231 	mov	$1, %r9d		/* byte position left over from less32bytes case */
    232 	/*
    233 	 * Setup %r10 value allows us to detect crossing a page boundary.
    234 	 * When %r10 goes positive we have crossed a page boundary and
    235 	 * need to do a nibble.
    236 	 */
    237 	lea	1(%rdi), %r10
    238 	and	$0xfff, %r10		/* offset into 4K page */
    239 	sub	$0x1000, %r10		/* subtract 4K pagesize */
    240 
    241 	.p2align 4
    242 L(loop_ashr_1):
    243 	add	$16, %r10
    244 	jg	L(nibble_ashr_1)	/* cross page boundary */
    245 
    246 L(gobble_ashr_1):
    247 	movdqa	(%rsi, %rcx), %xmm1
    248 	movdqa	(%rdi, %rcx), %xmm2
    249 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
    250 
    251 	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
    252 
    253 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    254 	pcmpeqb	%xmm2, %xmm1		/* equality mask */
    255 	psubb	%xmm0, %xmm1
    256 	pmovmskb %xmm1, %edx
    257 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    258 	jnz	L(exit)
    259 
    260 #ifdef USE_AS_STRNCMP
    261 	sub	$16, %r11
    262 	jbe	L(strcmp_exitz)		/* count used up */
    263 #endif
    264 	add	$16, %rcx
    265 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    266 
    267 	add	$16, %r10
    268 	jg	L(nibble_ashr_1)	/* cross page boundary */
    269 
    270 	movdqa	(%rsi, %rcx), %xmm1
    271 	movdqa	(%rdi, %rcx), %xmm2
    272 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    273 
    274 	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
    275 
    276 	pcmpeqb	%xmm1, %xmm0
    277 	pcmpeqb	%xmm2, %xmm1
    278 	psubb	%xmm0, %xmm1
    279 	pmovmskb %xmm1, %edx
    280 	sub	$0xffff, %edx
    281 	jnz	L(exit)
    282 
    283 #ifdef USE_AS_STRNCMP
    284 	sub	$16, %r11
    285 	jbe	L(strcmp_exitz)
    286 #endif
    287 	add	$16, %rcx
    288 	movdqa	%xmm4, %xmm3
    289 	jmp	L(loop_ashr_1)
    290 
    291 	/*
    292 	 * Nibble avoids loads across page boundary. This is to avoid a potential
    293 	 * access into unmapped memory.
    294 	 */
    295 	.p2align 4
    296 L(nibble_ashr_1):
    297 	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char*/
    298 	pmovmskb %xmm0, %edx
    299 	test	$0xfffe, %edx		/* bits 1..15: the 15 pending rdi bytes */
    300 	jnz	L(ashr_1_exittail)	/* find null char*/
    301 
    302 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 15 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    303 	cmp	$15, %r11
    304 	jbe	L(ashr_1_exittail)
    305 #endif
    306 
    307 	pxor	%xmm0, %xmm0
    308 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    309 	jmp	L(gobble_ashr_1)
    310 
    311 	/*
    312 	 * Once find null char, determine if there is a string mismatch
    313 	 * before the null char.
    314 	 */
    315 	.p2align 4
    316 L(ashr_1_exittail):
    317 	movdqa	(%rsi, %rcx), %xmm1
    318 	psrldq	$1, %xmm0		/* line the null mask up with the pending bytes */
    319 	psrldq	$1, %xmm3		/* pending rdi bytes moved down to byte 0 */
    320 	jmp	L(aftertail)
    321 
    322 /*
    323  * The following cases will be handled by ashr_2
    324  * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
    325  *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
         *
         * The rsi offset exceeds the rdi offset by 14.  Each aligned rsi block
         * is compared against the last 14 bytes of the previous rdi block
         * joined with the first 2 bytes of the current one (palignr $2);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_2
         * checks whether the comparison can finish inside the 14 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 14), so the next page is only read when needed.
    326  */
    327 	.p2align 4
    328 L(ashr_2):
    329 	pxor	%xmm0, %xmm0
    330 	movdqa	(%rdi), %xmm2
    331 	movdqa	(%rsi), %xmm1
    332 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    333 	pslldq	$14, %xmm2		/* line rdi bytes up with the rsi bytes */
    334 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    335 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    336 	pmovmskb %xmm2, %r9d
    337 	shr	%cl, %edx		/* drop mask bits in front of the string */
    338 	shr	%cl, %r9d
    339 	sub	%r9d, %edx
    340 	jnz	L(less32bytes)		/* mismatch or null char seen */
    341 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    342 	UPDATE_STRNCMP_COUNTER
    343 
    344 	pxor	%xmm0, %xmm0
    345 	mov	$16, %rcx	/* index for loads */
    346 	mov	$2, %r9d	/* byte position left over from less32bytes case */
    347 	/*
    348 	 * Setup %r10 value allows us to detect crossing a page boundary.
    349 	 * When %r10 goes positive we have crossed a page boundary and
    350 	 * need to do a nibble.
    351 	 */
    352 	lea	2(%rdi), %r10
    353 	and	$0xfff, %r10	/* offset into 4K page */
    354 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    355 
    356 	.p2align 4
    357 L(loop_ashr_2):
    358 	add	$16, %r10
    359 	jg	L(nibble_ashr_2)	/* cross page boundary */
    360 
    361 L(gobble_ashr_2):
    362 	movdqa	(%rsi, %rcx), %xmm1
    363 	movdqa	(%rdi, %rcx), %xmm2
    364 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    365 
    366 	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
    367 
    368 	pcmpeqb	%xmm1, %xmm0
    369 	pcmpeqb	%xmm2, %xmm1
    370 	psubb	%xmm0, %xmm1
    371 	pmovmskb %xmm1, %edx
    372 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    373 	jnz	L(exit)
    374 
    375 #ifdef USE_AS_STRNCMP
    376 	sub	$16, %r11
    377 	jbe	L(strcmp_exitz)		/* count used up */
    378 #endif
    379 
    380 	add	$16, %rcx
    381 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    382 
    383 	add	$16, %r10
    384 	jg	L(nibble_ashr_2)	/* cross page boundary */
    385 
    386 	movdqa	(%rsi, %rcx), %xmm1
    387 	movdqa	(%rdi, %rcx), %xmm2
    388 	movdqa	%xmm2, %xmm4
    389 
    390 	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
    391 
    392 	pcmpeqb	%xmm1, %xmm0
    393 	pcmpeqb	%xmm2, %xmm1
    394 	psubb	%xmm0, %xmm1
    395 	pmovmskb %xmm1, %edx
    396 	sub	$0xffff, %edx
    397 	jnz	L(exit)
    398 
    399 #ifdef USE_AS_STRNCMP
    400 	sub	$16, %r11
    401 	jbe	L(strcmp_exitz)
    402 #endif
    403 
    404 	add	$16, %rcx
    405 	movdqa	%xmm4, %xmm3
    406 	jmp	L(loop_ashr_2)
    407 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    408 	.p2align 4
    409 L(nibble_ashr_2):
    410 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    411 	pmovmskb %xmm0, %edx
    412 	test	$0xfffc, %edx		/* bits 2..15: the 14 pending rdi bytes */
    413 	jnz	L(ashr_2_exittail)
    414 
    415 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 14 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    416 	cmp	$14, %r11
    417 	jbe	L(ashr_2_exittail)
    418 #endif
    419 
    420 	pxor	%xmm0, %xmm0
    421 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    422 	jmp	L(gobble_ashr_2)
    423 
    424 	.p2align 4
    425 L(ashr_2_exittail):
    426 	movdqa	(%rsi, %rcx), %xmm1
    427 	psrldq	$2, %xmm0		/* line the null mask up with the pending bytes */
    428 	psrldq	$2, %xmm3		/* pending rdi bytes moved down to byte 0 */
    429 	jmp	L(aftertail)
    430 
    431 /*
    432  * The following cases will be handled by ashr_3
    433  *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
    434  *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
         *
         * The rsi offset exceeds the rdi offset by 13.  Each aligned rsi block
         * is compared against the last 13 bytes of the previous rdi block
         * joined with the first 3 bytes of the current one (palignr $3);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_3
         * checks whether the comparison can finish inside the 13 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 13), so the next page is only read when needed.
    435  */
    436 	.p2align 4
    437 L(ashr_3):
    438 	pxor	%xmm0, %xmm0
    439 	movdqa	(%rdi), %xmm2
    440 	movdqa	(%rsi), %xmm1
    441 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    442 	pslldq	$13, %xmm2		/* line rdi bytes up with the rsi bytes */
    443 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    444 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    445 	pmovmskb %xmm2, %r9d
    446 	shr	%cl, %edx		/* drop mask bits in front of the string */
    447 	shr	%cl, %r9d
    448 	sub	%r9d, %edx
    449 	jnz	L(less32bytes)		/* mismatch or null char seen */
    450 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    451 
    452 	UPDATE_STRNCMP_COUNTER
    453 
    454 	pxor	%xmm0, %xmm0
    455 	mov	$16, %rcx	/* index for loads */
    456 	mov	$3, %r9d	/* byte position left over from less32bytes case */
    457 	/*
    458 	 * Setup %r10 value allows us to detect crossing a page boundary.
    459 	 * When %r10 goes positive we have crossed a page boundary and
    460 	 * need to do a nibble.
    461 	 */
    462 	lea	3(%rdi), %r10
    463 	and	$0xfff, %r10	/* offset into 4K page */
    464 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    465 
    466 	.p2align 4
    467 L(loop_ashr_3):
    468 	add	$16, %r10
    469 	jg	L(nibble_ashr_3)	/* cross page boundary */
    470 
    471 L(gobble_ashr_3):
    472 	movdqa	(%rsi, %rcx), %xmm1
    473 	movdqa	(%rdi, %rcx), %xmm2
    474 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    475 
    476 	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
    477 
    478 	pcmpeqb	%xmm1, %xmm0
    479 	pcmpeqb	%xmm2, %xmm1
    480 	psubb	%xmm0, %xmm1
    481 	pmovmskb %xmm1, %edx
    482 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    483 	jnz	L(exit)
    484 
    485 #ifdef USE_AS_STRNCMP
    486 	sub	$16, %r11
    487 	jbe	L(strcmp_exitz)		/* count used up */
    488 #endif
    489 
    490 	add	$16, %rcx
    491 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    492 
    493 	add	$16, %r10
    494 	jg	L(nibble_ashr_3)	/* cross page boundary */
    495 
    496 	movdqa	(%rsi, %rcx), %xmm1
    497 	movdqa	(%rdi, %rcx), %xmm2
    498 	movdqa	%xmm2, %xmm4
    499 
    500 	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
    501 
    502 	pcmpeqb	%xmm1, %xmm0
    503 	pcmpeqb	%xmm2, %xmm1
    504 	psubb	%xmm0, %xmm1
    505 	pmovmskb %xmm1, %edx
    506 	sub	$0xffff, %edx
    507 	jnz	L(exit)
    508 
    509 #ifdef USE_AS_STRNCMP
    510 	sub	$16, %r11
    511 	jbe	L(strcmp_exitz)
    512 #endif
    513 
    514 	add	$16, %rcx
    515 	movdqa	%xmm4, %xmm3
    516 	jmp	L(loop_ashr_3)
    517 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    518 	.p2align 4
    519 L(nibble_ashr_3):
    520 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    521 	pmovmskb %xmm0, %edx
    522 	test	$0xfff8, %edx		/* bits 3..15: the 13 pending rdi bytes */
    523 	jnz	L(ashr_3_exittail)
    524 
    525 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 13 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    526 	cmp	$13, %r11
    527 	jbe	L(ashr_3_exittail)
    528 #endif
    529 
    530 	pxor	%xmm0, %xmm0
    531 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    532 	jmp	L(gobble_ashr_3)
    533 
    534 	.p2align 4
    535 L(ashr_3_exittail):
    536 	movdqa	(%rsi, %rcx), %xmm1
    537 	psrldq	$3, %xmm0		/* line the null mask up with the pending bytes */
    538 	psrldq	$3, %xmm3		/* pending rdi bytes moved down to byte 0 */
    539 	jmp	L(aftertail)
    540 
    541 /*
    542  * The following cases will be handled by ashr_4
    543  *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
    544  *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
         *
         * The rsi offset exceeds the rdi offset by 12.  Each aligned rsi block
         * is compared against the last 12 bytes of the previous rdi block
         * joined with the first 4 bytes of the current one (palignr $4);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_4
         * checks whether the comparison can finish inside the 12 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 12), so the next page is only read when needed.
    545  */
    546 	.p2align 4
    547 L(ashr_4):
    548 	pxor	%xmm0, %xmm0
    549 	movdqa	(%rdi), %xmm2
    550 	movdqa	(%rsi), %xmm1
    551 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    552 	pslldq	$12, %xmm2		/* line rdi bytes up with the rsi bytes */
    553 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    554 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    555 	pmovmskb %xmm2, %r9d
    556 	shr	%cl, %edx		/* drop mask bits in front of the string */
    557 	shr	%cl, %r9d
    558 	sub	%r9d, %edx
    559 	jnz	L(less32bytes)		/* mismatch or null char seen */
    560 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    561 
    562 	UPDATE_STRNCMP_COUNTER
    563 
    564 	pxor	%xmm0, %xmm0
    565 	mov	$16, %rcx	/* index for loads */
    566 	mov	$4, %r9d	/* byte position left over from less32bytes case */
    567 	/*
    568 	 * Setup %r10 value allows us to detect crossing a page boundary.
    569 	 * When %r10 goes positive we have crossed a page boundary and
    570 	 * need to do a nibble.
    571 	 */
    572 	lea	4(%rdi), %r10
    573 	and	$0xfff, %r10	/* offset into 4K page */
    574 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    575 
    576 	.p2align 4
    577 L(loop_ashr_4):
    578 	add	$16, %r10
    579 	jg	L(nibble_ashr_4)	/* cross page boundary */
    580 
    581 L(gobble_ashr_4):
    582 	movdqa	(%rsi, %rcx), %xmm1
    583 	movdqa	(%rdi, %rcx), %xmm2
    584 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    585 
    586 	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
    587 
    588 	pcmpeqb	%xmm1, %xmm0
    589 	pcmpeqb	%xmm2, %xmm1
    590 	psubb	%xmm0, %xmm1
    591 	pmovmskb %xmm1, %edx
    592 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    593 	jnz	L(exit)
    594 
    595 #ifdef USE_AS_STRNCMP
    596 	sub	$16, %r11
    597 	jbe	L(strcmp_exitz)		/* count used up */
    598 #endif
    599 
    600 	add	$16, %rcx
    601 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    602 
    603 	add	$16, %r10
    604 	jg	L(nibble_ashr_4)	/* cross page boundary */
    605 
    606 	movdqa	(%rsi, %rcx), %xmm1
    607 	movdqa	(%rdi, %rcx), %xmm2
    608 	movdqa	%xmm2, %xmm4
    609 
    610 	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
    611 
    612 	pcmpeqb	%xmm1, %xmm0
    613 	pcmpeqb	%xmm2, %xmm1
    614 	psubb	%xmm0, %xmm1
    615 	pmovmskb %xmm1, %edx
    616 	sub	$0xffff, %edx
    617 	jnz	L(exit)
    618 
    619 #ifdef USE_AS_STRNCMP
    620 	sub	$16, %r11
    621 	jbe	L(strcmp_exitz)
    622 #endif
    623 
    624 	add	$16, %rcx
    625 	movdqa	%xmm4, %xmm3
    626 	jmp	L(loop_ashr_4)
    627 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    628 	.p2align 4
    629 L(nibble_ashr_4):
    630 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    631 	pmovmskb %xmm0, %edx
    632 	test	$0xfff0, %edx		/* bits 4..15: the 12 pending rdi bytes */
    633 	jnz	L(ashr_4_exittail)
    634 
    635 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 12 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    636 	cmp	$12, %r11
    637 	jbe	L(ashr_4_exittail)
    638 #endif
    639 
    640 	pxor	%xmm0, %xmm0
    641 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    642 	jmp	L(gobble_ashr_4)
    643 
    644 	.p2align 4
    645 L(ashr_4_exittail):
    646 	movdqa	(%rsi, %rcx), %xmm1
    647 	psrldq	$4, %xmm0		/* line the null mask up with the pending bytes */
    648 	psrldq	$4, %xmm3		/* pending rdi bytes moved down to byte 0 */
    649 	jmp	L(aftertail)
    650 
    651 /*
    652  * The following cases will be handled by ashr_5
    653  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
    654  *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
         *
         * The rsi offset exceeds the rdi offset by 11.  Each aligned rsi block
         * is compared against the last 11 bytes of the previous rdi block
         * joined with the first 5 bytes of the current one (palignr $5);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_5
         * checks whether the comparison can finish inside the 11 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 11), so the next page is only read when needed.
    655  */
    656 	.p2align 4
    657 L(ashr_5):
    658 	pxor	%xmm0, %xmm0
    659 	movdqa	(%rdi), %xmm2
    660 	movdqa	(%rsi), %xmm1
    661 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    662 	pslldq	$11, %xmm2		/* line rdi bytes up with the rsi bytes */
    663 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    664 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    665 	pmovmskb %xmm2, %r9d
    666 	shr	%cl, %edx		/* drop mask bits in front of the string */
    667 	shr	%cl, %r9d
    668 	sub	%r9d, %edx
    669 	jnz	L(less32bytes)		/* mismatch or null char seen */
    670 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    671 
    672 	UPDATE_STRNCMP_COUNTER
    673 
    674 	pxor	%xmm0, %xmm0
    675 	mov	$16, %rcx	/* index for loads */
    676 	mov	$5, %r9d	/* byte position left over from less32bytes case */
    677 	/*
    678 	 * Setup %r10 value allows us to detect crossing a page boundary.
    679 	 * When %r10 goes positive we have crossed a page boundary and
    680 	 * need to do a nibble.
    681 	 */
    682 	lea	5(%rdi), %r10
    683 	and	$0xfff, %r10	/* offset into 4K page */
    684 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    685 
    686 	.p2align 4
    687 L(loop_ashr_5):
    688 	add	$16, %r10
    689 	jg	L(nibble_ashr_5)	/* cross page boundary */
    690 
    691 L(gobble_ashr_5):
    692 	movdqa	(%rsi, %rcx), %xmm1
    693 	movdqa	(%rdi, %rcx), %xmm2
    694 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    695 
    696 	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
    697 
    698 	pcmpeqb	%xmm1, %xmm0
    699 	pcmpeqb	%xmm2, %xmm1
    700 	psubb	%xmm0, %xmm1
    701 	pmovmskb %xmm1, %edx
    702 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    703 	jnz	L(exit)
    704 
    705 #ifdef USE_AS_STRNCMP
    706 	sub	$16, %r11
    707 	jbe	L(strcmp_exitz)		/* count used up */
    708 #endif
    709 
    710 	add	$16, %rcx
    711 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    712 
    713 	add	$16, %r10
    714 	jg	L(nibble_ashr_5)	/* cross page boundary */
    715 
    716 	movdqa	(%rsi, %rcx), %xmm1
    717 	movdqa	(%rdi, %rcx), %xmm2
    718 	movdqa	%xmm2, %xmm4
    719 
    720 	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
    721 
    722 	pcmpeqb	%xmm1, %xmm0
    723 	pcmpeqb	%xmm2, %xmm1
    724 	psubb	%xmm0, %xmm1
    725 	pmovmskb %xmm1, %edx
    726 	sub	$0xffff, %edx
    727 	jnz	L(exit)
    728 
    729 #ifdef USE_AS_STRNCMP
    730 	sub	$16, %r11
    731 	jbe	L(strcmp_exitz)
    732 #endif
    733 
    734 	add	$16, %rcx
    735 	movdqa	%xmm4, %xmm3
    736 	jmp	L(loop_ashr_5)
    737 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    738 	.p2align 4
    739 L(nibble_ashr_5):
    740 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    741 	pmovmskb %xmm0, %edx
    742 	test	$0xffe0, %edx		/* bits 5..15: the 11 pending rdi bytes */
    743 	jnz	L(ashr_5_exittail)
    744 
    745 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 11 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    746 	cmp	$11, %r11
    747 	jbe	L(ashr_5_exittail)
    748 #endif
    749 
    750 	pxor	%xmm0, %xmm0
    751 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    752 	jmp	L(gobble_ashr_5)
    753 
    754 	.p2align 4
    755 L(ashr_5_exittail):
    756 	movdqa	(%rsi, %rcx), %xmm1
    757 	psrldq	$5, %xmm0		/* line the null mask up with the pending bytes */
    758 	psrldq	$5, %xmm3		/* pending rdi bytes moved down to byte 0 */
    759 	jmp	L(aftertail)
    760 
    761 /*
    762  * The following cases will be handled by ashr_6
    763  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
    764  *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
         *
         * The rsi offset exceeds the rdi offset by 10.  Each aligned rsi block
         * is compared against the last 10 bytes of the previous rdi block
         * joined with the first 6 bytes of the current one (palignr $6);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_6
         * checks whether the comparison can finish inside the 10 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 10), so the next page is only read when needed.
    765  */
    766 	.p2align 4
    767 L(ashr_6):
    768 	pxor	%xmm0, %xmm0
    769 	movdqa	(%rdi), %xmm2
    770 	movdqa	(%rsi), %xmm1
    771 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    772 	pslldq	$10, %xmm2		/* line rdi bytes up with the rsi bytes */
    773 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    774 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    775 	pmovmskb %xmm2, %r9d
    776 	shr	%cl, %edx		/* drop mask bits in front of the string */
    777 	shr	%cl, %r9d
    778 	sub	%r9d, %edx
    779 	jnz	L(less32bytes)		/* mismatch or null char seen */
    780 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    781 
    782 	UPDATE_STRNCMP_COUNTER
    783 
    784 	pxor	%xmm0, %xmm0
    785 	mov	$16, %rcx	/* index for loads */
    786 	mov	$6, %r9d	/* byte position left over from less32bytes case */
    787 	/*
    788 	 * Setup %r10 value allows us to detect crossing a page boundary.
    789 	 * When %r10 goes positive we have crossed a page boundary and
    790 	 * need to do a nibble.
    791 	 */
    792 	lea	6(%rdi), %r10
    793 	and	$0xfff, %r10	/* offset into 4K page */
    794 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    795 
    796 	.p2align 4
    797 L(loop_ashr_6):
    798 	add	$16, %r10
    799 	jg	L(nibble_ashr_6)	/* cross page boundary */
    800 
    801 L(gobble_ashr_6):
    802 	movdqa	(%rsi, %rcx), %xmm1
    803 	movdqa	(%rdi, %rcx), %xmm2
    804 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    805 
    806 	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
    807 
    808 	pcmpeqb	%xmm1, %xmm0
    809 	pcmpeqb	%xmm2, %xmm1
    810 	psubb	%xmm0, %xmm1
    811 	pmovmskb %xmm1, %edx
    812 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    813 	jnz	L(exit)
    814 
    815 #ifdef USE_AS_STRNCMP
    816 	sub	$16, %r11
    817 	jbe	L(strcmp_exitz)		/* count used up */
    818 #endif
    819 
    820 	add	$16, %rcx
    821 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    822 
    823 	add	$16, %r10
    824 	jg	L(nibble_ashr_6)	/* cross page boundary */
    825 
    826 	movdqa	(%rsi, %rcx), %xmm1
    827 	movdqa	(%rdi, %rcx), %xmm2
    828 	movdqa	%xmm2, %xmm4
    829 
    830 	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
    831 
    832 	pcmpeqb	%xmm1, %xmm0
    833 	pcmpeqb	%xmm2, %xmm1
    834 	psubb	%xmm0, %xmm1
    835 	pmovmskb %xmm1, %edx
    836 	sub	$0xffff, %edx
    837 	jnz	L(exit)
    838 
    839 #ifdef USE_AS_STRNCMP
    840 	sub	$16, %r11
    841 	jbe	L(strcmp_exitz)
    842 #endif
    843 
    844 	add	$16, %rcx
    845 	movdqa	%xmm4, %xmm3
    846 	jmp	L(loop_ashr_6)
    847 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    848 	.p2align 4
    849 L(nibble_ashr_6):
    850 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    851 	pmovmskb %xmm0, %edx
    852 	test	$0xffc0, %edx		/* bits 6..15: the 10 pending rdi bytes */
    853 	jnz	L(ashr_6_exittail)
    854 
    855 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 10 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    856 	cmp	$10, %r11
    857 	jbe	L(ashr_6_exittail)
    858 #endif
    859 
    860 	pxor	%xmm0, %xmm0
    861 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    862 	jmp	L(gobble_ashr_6)
    863 
    864 	.p2align 4
    865 L(ashr_6_exittail):
    866 	movdqa	(%rsi, %rcx), %xmm1
    867 	psrldq	$6, %xmm0		/* line the null mask up with the pending bytes */
    868 	psrldq	$6, %xmm3		/* pending rdi bytes moved down to byte 0 */
    869 	jmp	L(aftertail)
    870 
    871 /*
    872  * The following cases will be handled by ashr_7
    873  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
    874  *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
         *
         * The rsi offset exceeds the rdi offset by 9.  Each aligned rsi block
         * is compared against the last 9 bytes of the previous rdi block
         * joined with the first 7 bytes of the current one (palignr $7);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_7
         * checks whether the comparison can finish inside the 9 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 9), so the next page is only read when needed.
    875  */
    876 	.p2align 4
    877 L(ashr_7):
    878 	pxor	%xmm0, %xmm0
    879 	movdqa	(%rdi), %xmm2
    880 	movdqa	(%rsi), %xmm1
    881 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    882 	pslldq	$9, %xmm2		/* line rdi bytes up with the rsi bytes */
    883 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    884 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    885 	pmovmskb %xmm2, %r9d
    886 	shr	%cl, %edx		/* drop mask bits in front of the string */
    887 	shr	%cl, %r9d
    888 	sub	%r9d, %edx
    889 	jnz	L(less32bytes)		/* mismatch or null char seen */
    890 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
    891 
    892 	UPDATE_STRNCMP_COUNTER
    893 
    894 	pxor	%xmm0, %xmm0
    895 	mov	$16, %rcx	/* index for loads */
    896 	mov	$7, %r9d	/* byte position left over from less32bytes case */
    897 	/*
    898 	 * Setup %r10 value allows us to detect crossing a page boundary.
    899 	 * When %r10 goes positive we have crossed a page boundary and
    900 	 * need to do a nibble.
    901 	 */
    902 	lea	7(%rdi), %r10
    903 	and	$0xfff, %r10	/* offset into 4K page */
    904 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    905 
    906 	.p2align 4
    907 L(loop_ashr_7):
    908 	add	$16, %r10
    909 	jg	L(nibble_ashr_7)	/* cross page boundary */
    910 
    911 L(gobble_ashr_7):
    912 	movdqa	(%rsi, %rcx), %xmm1
    913 	movdqa	(%rdi, %rcx), %xmm2
    914 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    915 
    916 	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
    917 
    918 	pcmpeqb	%xmm1, %xmm0
    919 	pcmpeqb	%xmm2, %xmm1
    920 	psubb	%xmm0, %xmm1
    921 	pmovmskb %xmm1, %edx
    922 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
    923 	jnz	L(exit)
    924 
    925 #ifdef USE_AS_STRNCMP
    926 	sub	$16, %r11
    927 	jbe	L(strcmp_exitz)		/* count used up */
    928 #endif
    929 
    930 	add	$16, %rcx
    931 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
    932 
    933 	add	$16, %r10
    934 	jg	L(nibble_ashr_7)	/* cross page boundary */
    935 
    936 	movdqa	(%rsi, %rcx), %xmm1
    937 	movdqa	(%rdi, %rcx), %xmm2
    938 	movdqa	%xmm2, %xmm4
    939 
    940 	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
    941 
    942 	pcmpeqb	%xmm1, %xmm0
    943 	pcmpeqb	%xmm2, %xmm1
    944 	psubb	%xmm0, %xmm1
    945 	pmovmskb %xmm1, %edx
    946 	sub	$0xffff, %edx
    947 	jnz	L(exit)
    948 
    949 #ifdef USE_AS_STRNCMP
    950 	sub	$16, %r11
    951 	jbe	L(strcmp_exitz)
    952 #endif
    953 
    954 	add	$16, %rcx
    955 	movdqa	%xmm4, %xmm3
    956 	jmp	L(loop_ashr_7)
    957 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
    958 	.p2align 4
    959 L(nibble_ashr_7):
    960 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
    961 	pmovmskb %xmm0, %edx
    962 	test	$0xff80, %edx		/* bits 7..15: the 9 pending rdi bytes */
    963 	jnz	L(ashr_7_exittail)
    964 
    965 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 9 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
    966 	cmp	$9, %r11
    967 	jbe	L(ashr_7_exittail)
    968 #endif
    969 
    970 	pxor	%xmm0, %xmm0
    971 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
    972 	jmp	L(gobble_ashr_7)
    973 
    974 	.p2align 4
    975 L(ashr_7_exittail):
    976 	movdqa	(%rsi, %rcx), %xmm1
    977 	psrldq	$7, %xmm0		/* line the null mask up with the pending bytes */
    978 	psrldq	$7, %xmm3		/* pending rdi bytes moved down to byte 0 */
    979 	jmp	L(aftertail)
    980 
    981 /*
    982  *  The following cases will be handled by ashr_8
    983  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
    984  *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
         *
         * The rsi offset exceeds the rdi offset by 8.  Each aligned rsi block
         * is compared against the last 8 bytes of the previous rdi block
         * joined with the first 8 bytes of the current one (palignr $8);
         * %xmm3 carries the previous rdi block between steps.
         *
         * Before an rdi load that would start a new 4K page, nibble_ashr_8
         * checks whether the comparison can finish inside the 8 bytes still
         * held in %xmm3 (a null among them, or for strncmp a remaining count
         * of at most 8), so the next page is only read when needed.
    985  */
    986 	.p2align 4
    987 L(ashr_8):
    988 	pxor	%xmm0, %xmm0
    989 	movdqa	(%rdi), %xmm2
    990 	movdqa	(%rsi), %xmm1
    991 	pcmpeqb	%xmm1, %xmm0		/* null mask of the rsi block */
    992 	pslldq	$8, %xmm2		/* line rdi bytes up with the rsi bytes */
    993 	pcmpeqb	%xmm1, %xmm2		/* equality mask */
    994 	psubb	%xmm0, %xmm2		/* byte stays 0xff only where equal AND not null */
    995 	pmovmskb %xmm2, %r9d
    996 	shr	%cl, %edx		/* drop mask bits in front of the string */
    997 	shr	%cl, %r9d
    998 	sub	%r9d, %edx
    999 	jnz	L(less32bytes)		/* mismatch or null char seen */
   1000 	movdqa	(%rdi), %xmm3		/* keep the first rdi block for palignr */
   1001 
   1002 	UPDATE_STRNCMP_COUNTER
   1003 
   1004 	pxor	%xmm0, %xmm0
   1005 	mov	$16, %rcx	/* index for loads */
   1006 	mov	$8, %r9d	/* byte position left over from less32bytes case */
   1007 	/*
   1008 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1009 	 * When %r10 goes positive we have crossed a page boundary and
   1010 	 * need to do a nibble.
   1011 	 */
   1012 	lea	8(%rdi), %r10
   1013 	and	$0xfff, %r10	/* offset into 4K page */
   1014 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1015 
   1016 	.p2align 4
   1017 L(loop_ashr_8):
   1018 	add	$16, %r10
   1019 	jg	L(nibble_ashr_8)	/* cross page boundary */
   1020 
   1021 L(gobble_ashr_8):
   1022 	movdqa	(%rsi, %rcx), %xmm1
   1023 	movdqa	(%rdi, %rcx), %xmm2
   1024 	movdqa	%xmm2, %xmm4		/* store for next cycle */
   1025 
   1026 	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
   1027 
   1028 	pcmpeqb	%xmm1, %xmm0
   1029 	pcmpeqb	%xmm2, %xmm1
   1030 	psubb	%xmm0, %xmm1
   1031 	pmovmskb %xmm1, %edx
   1032 	sub	$0xffff, %edx		/* zero iff all 16 bytes equal and non-null */
   1033 	jnz	L(exit)
   1034 
   1035 #ifdef USE_AS_STRNCMP
   1036 	sub	$16, %r11
   1037 	jbe	L(strcmp_exitz)		/* count used up */
   1038 #endif
   1039 
   1040 	add	$16, %rcx
   1041 	movdqa	%xmm4, %xmm3		/* current rdi block becomes the previous one */
   1042 
   1043 	add	$16, %r10
   1044 	jg	L(nibble_ashr_8)	/* cross page boundary */
   1045 
   1046 	movdqa	(%rsi, %rcx), %xmm1
   1047 	movdqa	(%rdi, %rcx), %xmm2
   1048 	movdqa	%xmm2, %xmm4
   1049 
   1050 	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
   1051 
   1052 	pcmpeqb	%xmm1, %xmm0
   1053 	pcmpeqb	%xmm2, %xmm1
   1054 	psubb	%xmm0, %xmm1
   1055 	pmovmskb %xmm1, %edx
   1056 	sub	$0xffff, %edx
   1057 	jnz	L(exit)
   1058 
   1059 #ifdef USE_AS_STRNCMP
   1060 	sub	$16, %r11
   1061 	jbe	L(strcmp_exitz)
   1062 #endif
   1063 
   1064 	add	$16, %rcx
   1065 	movdqa	%xmm4, %xmm3
   1066 	jmp	L(loop_ashr_8)
   1067 
        	/*
        	 * Nibble avoids loads across a page boundary, so that memory which
        	 * may be unmapped is not touched.
        	 */
   1068 	.p2align 4
   1069 L(nibble_ashr_8):
   1070 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1071 	pmovmskb %xmm0, %edx
   1072 	test	$0xff00, %edx		/* bits 8..15: the 8 pending rdi bytes */
   1073 	jnz	L(ashr_8_exittail)
   1074 
   1075 #ifdef USE_AS_STRNCMP
        	/*
        	 * %xmm3 still holds 8 uncompared rdi bytes.  If the remaining
        	 * count is covered by them, finish in the tail: loading the next
        	 * rdi block would read a page strncmp has no right to touch.
        	 */
   1076 	cmp	$8, %r11
   1077 	jbe	L(ashr_8_exittail)
   1078 #endif
   1079 
   1080 	pxor	%xmm0, %xmm0
   1081 	sub	$0x1000, %r10		/* re-arm the page-crossing detector */
   1082 	jmp	L(gobble_ashr_8)
   1083 
   1084 	.p2align 4
   1085 L(ashr_8_exittail):
   1086 	movdqa	(%rsi, %rcx), %xmm1
   1087 	psrldq	$8, %xmm0		/* line the null mask up with the pending bytes */
   1088 	psrldq	$8, %xmm3		/* pending rdi bytes moved down to byte 0 */
   1089 	jmp	L(aftertail)
   1090 
   1091 /*
   1092  *  The following cases will be handled by ashr_9
   1093  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1094  *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 7 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $9 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_9 looks for a null char
         *  in bytes 9..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1095  */
   1096 	.p2align 4
   1097 L(ashr_9):
   1098 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1099 	movdqa	(%rdi), %xmm2
   1100 	movdqa	(%rsi), %xmm1
   1101 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1102 	pslldq	$7, %xmm2		/* shift the %rdi block up by 7 bytes */
   1103 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1104 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1105 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1106 	shr	%cl, %edx
   1107 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1108 	sub	%r9d, %edx
   1109 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1110 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1111 
   1112 	UPDATE_STRNCMP_COUNTER
   1113 
   1114 	pxor	%xmm0, %xmm0
   1115 	mov	$16, %rcx	/* index for loads */
   1116 	mov	$9, %r9d	/* byte position left over from less32bytes case */
   1117 	/*
   1118 	 * Set up %r10 so that crossing a page boundary can be detected:
   1119 	 * %r10 = ((%rdi + 9) & 0xfff) - 0x1000, which is negative.
   1120 	 * When %r10 goes positive we have crossed a page boundary and
   1121 	 * need to do a nibble.
   1121 	 */
   1122 	lea	9(%rdi), %r10
   1123 	and	$0xfff, %r10	/* offset into 4K page */
   1124 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1125 
   1126 	.p2align 4
   1127 L(loop_ashr_9):
   1128 	add	$16, %r10
   1129 	jg	L(nibble_ashr_9)
   1130 
   1131 L(gobble_ashr_9):
   1132 	movdqa	(%rsi, %rcx), %xmm1
   1133 	movdqa	(%rdi, %rcx), %xmm2
   1134 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1135 
   1136 	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
   1137 
   1138 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1139 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1140 	psubb	%xmm0, %xmm1
   1141 	pmovmskb %xmm1, %edx
   1142 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1143 	jnz	L(exit)
   1144 
   1145 #ifdef USE_AS_STRNCMP
   1146 	sub	$16, %r11		/* 16 more bytes accounted for */
   1147 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1148 #endif
   1149 
   1150 	add	$16, %rcx
   1151 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1152 
   1153 	add	$16, %r10
   1154 	jg	L(nibble_ashr_9)	/* cross page boundary */
   1155 
   1156 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1157 	movdqa	(%rdi, %rcx), %xmm2
   1158 	movdqa	%xmm2, %xmm4
   1159 
   1160 	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
   1161 
   1162 	pcmpeqb	%xmm1, %xmm0
   1163 	pcmpeqb	%xmm2, %xmm1
   1164 	psubb	%xmm0, %xmm1
   1165 	pmovmskb %xmm1, %edx
   1166 	sub	$0xffff, %edx
   1167 	jnz	L(exit)
   1168 
   1169 #ifdef USE_AS_STRNCMP
   1170 	sub	$16, %r11
   1171 	jbe	L(strcmp_exitz)
   1172 #endif
   1173 
   1174 	add	$16, %rcx
   1175 	movdqa	%xmm4, %xmm3		/* store for next cycle */
   1176 	jmp	L(loop_ashr_9)
   1177 
   1178 	.p2align 4
   1179 L(nibble_ashr_9):
   1180 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1181 	pmovmskb %xmm0, %edx
   1182 	test	$0xfe00, %edx		/* look only at bytes 9..15 of %xmm3 */
   1183 	jnz	L(ashr_9_exittail)
   1184 
   1185 #ifdef USE_AS_STRNCMP
   1186 	cmp	$6, %r11
   1187 	jbe	L(ashr_9_exittail)	/* counter <= 6: take the tail path too */
   1188 #endif
   1189 
   1190 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1191 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1192 	jmp	L(gobble_ashr_9)
   1193 
   1194 	.p2align 4
   1195 L(ashr_9_exittail):
   1196 	movdqa	(%rsi, %rcx), %xmm1
   1197 	psrldq	$9, %xmm0		/* move the null mask of bytes 9..15 down to byte 0 */
   1198 	psrldq	$9, %xmm3		/* likewise for the leftover %rdi bytes */
   1199 	jmp	L(aftertail)
   1200 
   1201 /*
   1202  *  The following cases will be handled by ashr_10
   1203  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1204  *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 6 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $10 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_10 looks for a null
         *  char in bytes 10..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1205  */
   1206 	.p2align 4
   1207 L(ashr_10):
   1208 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1209 	movdqa	(%rdi), %xmm2
   1210 	movdqa	(%rsi), %xmm1
   1211 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1212 	pslldq	$6, %xmm2		/* shift the %rdi block up by 6 bytes */
   1213 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1214 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1215 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1216 	shr	%cl, %edx
   1217 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1218 	sub	%r9d, %edx
   1219 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1220 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1221 
   1222 	UPDATE_STRNCMP_COUNTER
   1223 
   1224 	pxor	%xmm0, %xmm0
   1225 	mov	$16, %rcx	/* index for loads */
   1226 	mov	$10, %r9d	/* byte position left over from less32bytes case */
   1227 	/*
   1228 	 * Set up %r10 so that crossing a page boundary can be detected:
   1229 	 * %r10 = ((%rdi + 10) & 0xfff) - 0x1000, which is negative.
   1230 	 * When %r10 goes positive we have crossed a page boundary and
   1231 	 * need to do a nibble.
   1231 	 */
   1232 	lea	10(%rdi), %r10
   1233 	and	$0xfff, %r10	/* offset into 4K page */
   1234 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1235 
   1236 	.p2align 4
   1237 L(loop_ashr_10):
   1238 	add	$16, %r10
   1239 	jg	L(nibble_ashr_10)
   1240 
   1241 L(gobble_ashr_10):
   1242 	movdqa	(%rsi, %rcx), %xmm1
   1243 	movdqa	(%rdi, %rcx), %xmm2
   1244 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1245 
   1246 	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
   1247 
   1248 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1249 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1250 	psubb	%xmm0, %xmm1
   1251 	pmovmskb %xmm1, %edx
   1252 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1253 	jnz	L(exit)
   1254 
   1255 #ifdef USE_AS_STRNCMP
   1256 	sub	$16, %r11		/* 16 more bytes accounted for */
   1257 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1258 #endif
   1259 
   1260 	add	$16, %rcx
   1261 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1262 
   1263 	add	$16, %r10
   1264 	jg	L(nibble_ashr_10)	/* cross page boundary */
   1265 
   1266 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1267 	movdqa	(%rdi, %rcx), %xmm2
   1268 	movdqa	%xmm2, %xmm4
   1269 
   1270 	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
   1271 
   1272 	pcmpeqb	%xmm1, %xmm0
   1273 	pcmpeqb	%xmm2, %xmm1
   1274 	psubb	%xmm0, %xmm1
   1275 	pmovmskb %xmm1, %edx
   1276 	sub	$0xffff, %edx
   1277 	jnz	L(exit)
   1278 
   1279 #ifdef USE_AS_STRNCMP
   1280 	sub	$16, %r11
   1281 	jbe	L(strcmp_exitz)
   1282 #endif
   1283 
   1284 	add	$16, %rcx
   1285 	movdqa	%xmm4, %xmm3
   1286 	jmp	L(loop_ashr_10)
   1287 
   1288 	.p2align 4
   1289 L(nibble_ashr_10):
   1290 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1291 	pmovmskb %xmm0, %edx
   1292 	test	$0xfc00, %edx		/* look only at bytes 10..15 of %xmm3 */
   1293 	jnz	L(ashr_10_exittail)
   1294 
   1295 #ifdef USE_AS_STRNCMP
   1296 	cmp	$5, %r11
   1297 	jbe	L(ashr_10_exittail)	/* counter <= 5: take the tail path too */
   1298 #endif
   1299 
   1300 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1301 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1302 	jmp	L(gobble_ashr_10)
   1303 
   1304 	.p2align 4
   1305 L(ashr_10_exittail):
   1306 	movdqa	(%rsi, %rcx), %xmm1
   1307 	psrldq	$10, %xmm0		/* move the null mask of bytes 10..15 down to byte 0 */
   1308 	psrldq	$10, %xmm3		/* likewise for the leftover %rdi bytes */
   1309 	jmp	L(aftertail)
   1310 
   1311 /*
   1312  *  The following cases will be handled by ashr_11
   1313  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1314  *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 5 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $11 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_11 looks for a null
         *  char in bytes 11..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1315  */
   1316 	.p2align 4
   1317 L(ashr_11):
   1318 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1319 	movdqa	(%rdi), %xmm2
   1320 	movdqa	(%rsi), %xmm1
   1321 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1322 	pslldq	$5, %xmm2		/* shift the %rdi block up by 5 bytes */
   1323 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1324 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1325 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1326 	shr	%cl, %edx
   1327 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1328 	sub	%r9d, %edx
   1329 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1330 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1331 
   1332 	UPDATE_STRNCMP_COUNTER
   1333 
   1334 	pxor	%xmm0, %xmm0
   1335 	mov	$16, %rcx	/* index for loads */
   1336 	mov	$11, %r9d	/* byte position left over from less32bytes case */
   1337 	/*
   1338 	 * Set up %r10 so that crossing a page boundary can be detected:
   1339 	 * %r10 = ((%rdi + 11) & 0xfff) - 0x1000, which is negative.
   1340 	 * When %r10 goes positive we have crossed a page boundary and
   1341 	 * need to do a nibble.
   1341 	 */
   1342 	lea	11(%rdi), %r10
   1343 	and	$0xfff, %r10	/* offset into 4K page */
   1344 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1345 
   1346 	.p2align 4
   1347 L(loop_ashr_11):
   1348 	add	$16, %r10
   1349 	jg	L(nibble_ashr_11)
   1350 
   1351 L(gobble_ashr_11):
   1352 	movdqa	(%rsi, %rcx), %xmm1
   1353 	movdqa	(%rdi, %rcx), %xmm2
   1354 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1355 
   1356 	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
   1357 
   1358 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1359 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1360 	psubb	%xmm0, %xmm1
   1361 	pmovmskb %xmm1, %edx
   1362 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1363 	jnz	L(exit)
   1364 
   1365 #ifdef USE_AS_STRNCMP
   1366 	sub	$16, %r11		/* 16 more bytes accounted for */
   1367 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1368 #endif
   1369 
   1370 	add	$16, %rcx
   1371 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1372 
   1373 	add	$16, %r10
   1374 	jg	L(nibble_ashr_11)	/* cross page boundary */
   1375 
   1376 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1377 	movdqa	(%rdi, %rcx), %xmm2
   1378 	movdqa	%xmm2, %xmm4
   1379 
   1380 	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
   1381 
   1382 	pcmpeqb	%xmm1, %xmm0
   1383 	pcmpeqb	%xmm2, %xmm1
   1384 	psubb	%xmm0, %xmm1
   1385 	pmovmskb %xmm1, %edx
   1386 	sub	$0xffff, %edx
   1387 	jnz	L(exit)
   1388 
   1389 #ifdef USE_AS_STRNCMP
   1390 	sub	$16, %r11
   1391 	jbe	L(strcmp_exitz)
   1392 #endif
   1393 
   1394 	add	$16, %rcx
   1395 	movdqa	%xmm4, %xmm3
   1396 	jmp	L(loop_ashr_11)
   1397 
   1398 	.p2align 4
   1399 L(nibble_ashr_11):
   1400 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1401 	pmovmskb %xmm0, %edx
   1402 	test	$0xf800, %edx		/* look only at bytes 11..15 of %xmm3 */
   1403 	jnz	L(ashr_11_exittail)
   1404 
   1405 #ifdef USE_AS_STRNCMP
   1406 	cmp	$4, %r11
   1407 	jbe	L(ashr_11_exittail)	/* counter <= 4: take the tail path too */
   1408 #endif
   1409 
   1410 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1411 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1412 	jmp	L(gobble_ashr_11)
   1413 
   1414 	.p2align 4
   1415 L(ashr_11_exittail):
   1416 	movdqa	(%rsi, %rcx), %xmm1
   1417 	psrldq	$11, %xmm0		/* move the null mask of bytes 11..15 down to byte 0 */
   1418 	psrldq	$11, %xmm3		/* likewise for the leftover %rdi bytes */
   1419 	jmp	L(aftertail)
   1420 
   1421 /*
   1422  *  The following cases will be handled by ashr_12
   1423  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1424  *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 4 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $12 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_12 looks for a null
         *  char in bytes 12..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1425  */
   1426 	.p2align 4
   1427 L(ashr_12):
   1428 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1429 	movdqa	(%rdi), %xmm2
   1430 	movdqa	(%rsi), %xmm1
   1431 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1432 	pslldq	$4, %xmm2		/* shift the %rdi block up by 4 bytes */
   1433 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1434 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1435 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1436 	shr	%cl, %edx
   1437 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1438 	sub	%r9d, %edx
   1439 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1440 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1441 
   1442 	UPDATE_STRNCMP_COUNTER
   1443 
   1444 	pxor	%xmm0, %xmm0
   1445 	mov	$16, %rcx	/* index for loads */
   1446 	mov	$12, %r9d	/* byte position left over from less32bytes case */
   1447 	/*
   1448 	 * Set up %r10 so that crossing a page boundary can be detected:
   1449 	 * %r10 = ((%rdi + 12) & 0xfff) - 0x1000, which is negative.
   1450 	 * When %r10 goes positive we have crossed a page boundary and
   1451 	 * need to do a nibble.
   1451 	 */
   1452 	lea	12(%rdi), %r10
   1453 	and	$0xfff, %r10	/* offset into 4K page */
   1454 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1455 
   1456 	.p2align 4
   1457 L(loop_ashr_12):
   1458 	add	$16, %r10
   1459 	jg	L(nibble_ashr_12)
   1460 
   1461 L(gobble_ashr_12):
   1462 	movdqa	(%rsi, %rcx), %xmm1
   1463 	movdqa	(%rdi, %rcx), %xmm2
   1464 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1465 
   1466 	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
   1467 
   1468 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1469 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1470 	psubb	%xmm0, %xmm1
   1471 	pmovmskb %xmm1, %edx
   1472 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1473 	jnz	L(exit)
   1474 
   1475 #ifdef USE_AS_STRNCMP
   1476 	sub	$16, %r11		/* 16 more bytes accounted for */
   1477 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1478 #endif
   1479 
   1480 	add	$16, %rcx
   1481 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1482 
   1483 	add	$16, %r10
   1484 	jg	L(nibble_ashr_12)	/* cross page boundary */
   1485 
   1486 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1487 	movdqa	(%rdi, %rcx), %xmm2
   1488 	movdqa	%xmm2, %xmm4
   1489 
   1490 	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
   1491 
   1492 	pcmpeqb	%xmm1, %xmm0
   1493 	pcmpeqb	%xmm2, %xmm1
   1494 	psubb	%xmm0, %xmm1
   1495 	pmovmskb %xmm1, %edx
   1496 	sub	$0xffff, %edx
   1497 	jnz	L(exit)
   1498 
   1499 #ifdef USE_AS_STRNCMP
   1500 	sub	$16, %r11
   1501 	jbe	L(strcmp_exitz)
   1502 #endif
   1503 
   1504 	add	$16, %rcx
   1505 	movdqa	%xmm4, %xmm3
   1506 	jmp	L(loop_ashr_12)
   1507 
   1508 	.p2align 4
   1509 L(nibble_ashr_12):
   1510 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1511 	pmovmskb %xmm0, %edx
   1512 	test	$0xf000, %edx		/* look only at bytes 12..15 of %xmm3 */
   1513 	jnz	L(ashr_12_exittail)
   1514 
   1515 #ifdef USE_AS_STRNCMP
   1516 	cmp	$3, %r11
   1517 	jbe	L(ashr_12_exittail)	/* counter <= 3: take the tail path too */
   1518 #endif
   1519 
   1520 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1521 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1522 	jmp	L(gobble_ashr_12)
   1523 
   1524 	.p2align 4
   1525 L(ashr_12_exittail):
   1526 	movdqa	(%rsi, %rcx), %xmm1
   1527 	psrldq	$12, %xmm0		/* move the null mask of bytes 12..15 down to byte 0 */
   1528 	psrldq	$12, %xmm3		/* likewise for the leftover %rdi bytes */
   1529 	jmp	L(aftertail)
   1530 
   1531 /*
   1532  *  The following cases will be handled by ashr_13
   1533  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1534  *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 3 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $13 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_13 looks for a null
         *  char in bytes 13..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1535  */
   1536 	.p2align 4
   1537 L(ashr_13):
   1538 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1539 	movdqa	(%rdi), %xmm2
   1540 	movdqa	(%rsi), %xmm1
   1541 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1542 	pslldq	$3, %xmm2		/* shift the %rdi block up by 3 bytes */
   1543 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1544 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1545 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1546 	shr	%cl, %edx
   1547 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1548 	sub	%r9d, %edx
   1549 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1550 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1551 
   1552 	UPDATE_STRNCMP_COUNTER
   1553 
   1554 	pxor	%xmm0, %xmm0
   1555 	mov	$16, %rcx	/* index for loads */
   1556 	mov	$13, %r9d	/* byte position left over from less32bytes case */
   1557 	/*
   1558 	 * Set up %r10 so that crossing a page boundary can be detected:
   1559 	 * %r10 = ((%rdi + 13) & 0xfff) - 0x1000, which is negative.
   1560 	 * When %r10 goes positive we have crossed a page boundary and
   1561 	 * need to do a nibble.
   1561 	 */
   1562 	lea	13(%rdi), %r10
   1563 	and	$0xfff, %r10	/* offset into 4K page */
   1564 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1565 
   1566 	.p2align 4
   1567 L(loop_ashr_13):
   1568 	add	$16, %r10
   1569 	jg	L(nibble_ashr_13)
   1570 
   1571 L(gobble_ashr_13):
   1572 	movdqa	(%rsi, %rcx), %xmm1
   1573 	movdqa	(%rdi, %rcx), %xmm2
   1574 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1575 
   1576 	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
   1577 
   1578 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1579 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1580 	psubb	%xmm0, %xmm1
   1581 	pmovmskb %xmm1, %edx
   1582 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1583 	jnz	L(exit)
   1584 
   1585 #ifdef USE_AS_STRNCMP
   1586 	sub	$16, %r11		/* 16 more bytes accounted for */
   1587 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1588 #endif
   1589 
   1590 	add	$16, %rcx
   1591 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1592 
   1593 	add	$16, %r10
   1594 	jg	L(nibble_ashr_13)	/* cross page boundary */
   1595 
   1596 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1597 	movdqa	(%rdi, %rcx), %xmm2
   1598 	movdqa	%xmm2, %xmm4
   1599 
   1600 	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
   1601 
   1602 	pcmpeqb	%xmm1, %xmm0
   1603 	pcmpeqb	%xmm2, %xmm1
   1604 	psubb	%xmm0, %xmm1
   1605 	pmovmskb %xmm1, %edx
   1606 	sub	$0xffff, %edx
   1607 	jnz	L(exit)
   1608 
   1609 #ifdef USE_AS_STRNCMP
   1610 	sub	$16, %r11
   1611 	jbe	L(strcmp_exitz)
   1612 #endif
   1613 
   1614 	add	$16, %rcx
   1615 	movdqa	%xmm4, %xmm3
   1616 	jmp	L(loop_ashr_13)
   1617 
   1618 	.p2align 4
   1619 L(nibble_ashr_13):
   1620 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1621 	pmovmskb %xmm0, %edx
   1622 	test	$0xe000, %edx		/* look only at bytes 13..15 of %xmm3 */
   1623 	jnz	L(ashr_13_exittail)
   1624 
   1625 #ifdef USE_AS_STRNCMP
   1626 	cmp	$2, %r11
   1627 	jbe	L(ashr_13_exittail)	/* counter <= 2: take the tail path too */
   1628 #endif
   1629 
   1630 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1631 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1632 	jmp	L(gobble_ashr_13)
   1633 
   1634 	.p2align 4
   1635 L(ashr_13_exittail):
   1636 	movdqa	(%rsi, %rcx), %xmm1
   1637 	psrldq  $13, %xmm0		/* move the null mask of bytes 13..15 down to byte 0 */
   1638 	psrldq  $13, %xmm3		/* likewise for the leftover %rdi bytes */
   1639 	jmp	L(aftertail)
   1640 
   1641 /*
   1642  *  The following cases will be handled by ashr_14
   1643  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1644  *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 2 bytes (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $14 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_14 looks for a null
         *  char in bytes 14..15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1645  */
   1646 	.p2align 4
   1647 L(ashr_14):
   1648 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1649 	movdqa	(%rdi), %xmm2
   1650 	movdqa	(%rsi), %xmm1
   1651 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1652 	pslldq  $2, %xmm2		/* shift the %rdi block up by 2 bytes */
   1653 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1654 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1655 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1656 	shr	%cl, %edx
   1657 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1658 	sub	%r9d, %edx
   1659 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1660 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1661 
   1662 	UPDATE_STRNCMP_COUNTER
   1663 
   1664 	pxor	%xmm0, %xmm0
   1665 	mov	$16, %rcx	/* index for loads */
   1666 	mov	$14, %r9d	/* byte position left over from less32bytes case */
   1667 	/*
   1668 	 * Set up %r10 so that crossing a page boundary can be detected:
   1669 	 * %r10 = ((%rdi + 14) & 0xfff) - 0x1000, which is negative.
   1670 	 * When %r10 goes positive we have crossed a page boundary and
   1671 	 * need to do a nibble.
   1671 	 */
   1672 	lea	14(%rdi), %r10
   1673 	and	$0xfff, %r10	/* offset into 4K page */
   1674 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1675 
   1676 	.p2align 4
   1677 L(loop_ashr_14):
   1678 	add	$16, %r10
   1679 	jg	L(nibble_ashr_14)
   1680 
   1681 L(gobble_ashr_14):
   1682 	movdqa	(%rsi, %rcx), %xmm1
   1683 	movdqa	(%rdi, %rcx), %xmm2
   1684 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1685 
   1686 	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
   1687 
   1688 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1689 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1690 	psubb	%xmm0, %xmm1
   1691 	pmovmskb %xmm1, %edx
   1692 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1693 	jnz	L(exit)
   1694 
   1695 #ifdef USE_AS_STRNCMP
   1696 	sub	$16, %r11		/* 16 more bytes accounted for */
   1697 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1698 #endif
   1699 
   1700 	add	$16, %rcx
   1701 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1702 
   1703 	add	$16, %r10
   1704 	jg	L(nibble_ashr_14)	/* cross page boundary */
   1705 
   1706 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1707 	movdqa	(%rdi, %rcx), %xmm2
   1708 	movdqa	%xmm2, %xmm4
   1709 
   1710 	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
   1711 
   1712 	pcmpeqb	%xmm1, %xmm0
   1713 	pcmpeqb	%xmm2, %xmm1
   1714 	psubb	%xmm0, %xmm1
   1715 	pmovmskb %xmm1, %edx
   1716 	sub	$0xffff, %edx
   1717 	jnz	L(exit)
   1718 
   1719 #ifdef USE_AS_STRNCMP
   1720 	sub	$16, %r11
   1721 	jbe	L(strcmp_exitz)
   1722 #endif
   1723 
   1724 	add	$16, %rcx
   1725 	movdqa	%xmm4, %xmm3
   1726 	jmp	L(loop_ashr_14)
   1727 
   1728 	.p2align 4
   1729 L(nibble_ashr_14):
   1730 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1731 	pmovmskb %xmm0, %edx
   1732 	test	$0xc000, %edx		/* look only at bytes 14..15 of %xmm3 */
   1733 	jnz	L(ashr_14_exittail)
   1734 
   1735 #ifdef USE_AS_STRNCMP
   1736 	cmp	$1, %r11
   1737 	jbe	L(ashr_14_exittail)	/* counter <= 1: take the tail path too */
   1738 #endif
   1739 
   1740 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1741 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1742 	jmp	L(gobble_ashr_14)
   1743 
   1744 	.p2align 4
   1745 L(ashr_14_exittail):
   1746 	movdqa	(%rsi, %rcx), %xmm1
   1747 	psrldq	$14, %xmm0		/* move the null mask of bytes 14..15 down to byte 0 */
   1748 	psrldq	$14, %xmm3		/* likewise for the leftover %rdi bytes */
   1749 	jmp	L(aftertail)
   1750 
   1751 /*
   1752  *  The following cases will be handled by ashr_15
   1753  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
   1754  *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
         *
         *  The first pair of 16-byte blocks is compared with the %rdi block
         *  shifted up by 1 byte (pslldq).  The loop that follows compares 16
         *  bytes per step, using palignr $15 to join the previous %rdi block
         *  (%xmm3) with the one just loaded.  %r10 counts up towards a 4K page
         *  boundary; once it goes positive, nibble_ashr_15 looks for a null
         *  char in byte 15 of %xmm3 before another block is loaded.
         *  With USE_AS_STRNCMP, %r11 is the byte counter.
         *  Unlike the other ashr_N cases, ashr_15_exittail has no jmp: it
         *  falls through into L(aftertail), which follows it in the file.
         *  NOTE(review): %edx, %rax, %rcx and %r8d are read here but written
         *  before this label, outside this section -- confirm their values.
   1755  */
   1756 	.p2align 4
   1757 L(ashr_15):
   1758 	pxor	%xmm0, %xmm0		/* %xmm0 = 0 for the null-byte test */
   1759 	movdqa	(%rdi), %xmm2
   1760 	movdqa	(%rsi), %xmm1
   1761 	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block has a null byte */
   1762 	pslldq	$1, %xmm2		/* shift the %rdi block up by 1 byte */
   1763 	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two blocks are equal */
   1764 	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-null bytes */
   1765 	pmovmskb %xmm2, %r9d		/* collect the top bits, one per byte */
   1766 	shr	%cl, %edx
   1767 	shr	%cl, %r9d		/* discard the low %cl bits of both masks */
   1768 	sub	%r9d, %edx
   1769 	jnz	L(less32bytes)		/* masks differ: finish from the first block */
   1770 
   1771 	movdqa	(%rdi), %xmm3		/* previous %rdi block, consumed by palignr below */
   1772 
   1773 	UPDATE_STRNCMP_COUNTER
   1774 
   1775 	pxor	%xmm0, %xmm0
   1776 	mov	$16, %rcx	/* index for loads */
   1777 	mov	$15, %r9d	/* byte position left over from less32bytes case */
   1778 	/*
   1779 	 * Set up %r10 so that crossing a page boundary can be detected:
   1780 	 * %r10 = ((%rdi + 15) & 0xfff) - 0x1000, which is negative.
   1781 	 * When %r10 goes positive we have crossed a page boundary and
   1782 	 * need to do a nibble.
   1782 	 */
   1783 	lea	15(%rdi), %r10
   1784 	and	$0xfff, %r10	/* offset into 4K page */
   1785 
   1786 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1787 
   1788 	.p2align 4
   1789 L(loop_ashr_15):
   1790 	add	$16, %r10
   1791 	jg	L(nibble_ashr_15)
   1792 
   1793 L(gobble_ashr_15):
   1794 	movdqa	(%rsi, %rcx), %xmm1
   1795 	movdqa	(%rdi, %rcx), %xmm2
   1796 	movdqa	%xmm2, %xmm4		/* keep the unshifted block for the next round */
   1797 
   1798 	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
   1799 
   1800 	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
   1801 	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
   1802 	psubb	%xmm0, %xmm1
   1803 	pmovmskb %xmm1, %edx
   1804 	sub	$0xffff, %edx		/* zero only when the mask is exactly 0xffff */
   1805 	jnz	L(exit)
   1806 
   1807 #ifdef USE_AS_STRNCMP
   1808 	sub	$16, %r11		/* 16 more bytes accounted for */
   1809 	jbe	L(strcmp_exitz)		/* counter used up: return 0 */
   1810 #endif
   1811 
   1812 	add	$16, %rcx
   1813 	movdqa	%xmm4, %xmm3		/* current block becomes the previous one */
   1814 
   1815 	add	$16, %r10
   1816 	jg	L(nibble_ashr_15)	/* cross page boundary */
   1817 
   1818 	movdqa	(%rsi, %rcx), %xmm1	/* unrolled copy of the step above */
   1819 	movdqa	(%rdi, %rcx), %xmm2
   1820 	movdqa	%xmm2, %xmm4
   1821 
   1822 	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
   1823 
   1824 	pcmpeqb	%xmm1, %xmm0
   1825 	pcmpeqb	%xmm2, %xmm1
   1826 	psubb	%xmm0, %xmm1
   1827 	pmovmskb %xmm1, %edx
   1828 	sub	$0xffff, %edx
   1829 	jnz	L(exit)
   1830 
   1831 #ifdef USE_AS_STRNCMP
   1832 	sub	$16, %r11
   1833 	jbe	L(strcmp_exitz)
   1834 #endif
   1835 
   1836 	add	$16, %rcx
   1837 	movdqa	%xmm4, %xmm3
   1838 	jmp	L(loop_ashr_15)
   1839 
   1840 	.p2align 4
   1841 L(nibble_ashr_15):
   1842 	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
   1843 	pmovmskb %xmm0, %edx
   1844 	test	$0x8000, %edx		/* look only at byte 15 of %xmm3 */
   1845 	jnz	L(ashr_15_exittail)
   1846 
   1847 #ifdef USE_AS_STRNCMP
   1848 	test	%r11, %r11
   1849 	je	L(ashr_15_exittail)	/* counter == 0: take the tail path too */
   1850 #endif
   1851 
   1852 	pxor	%xmm0, %xmm0		/* reset the null mask before resuming */
   1853 	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
   1854 	jmp	L(gobble_ashr_15)
   1855 
   1856 	.p2align 4
   1857 L(ashr_15_exittail):
   1858 	movdqa	(%rsi, %rcx), %xmm1
   1859 	psrldq	$15, %xmm3		/* move the leftover %rdi byte 15 down to byte 0 */
   1860 	psrldq	$15, %xmm0		/* likewise for its null mask; falls through to L(aftertail) */
   1861 
   1862 	.p2align 4
        /*
         * Common tail reached from the ashr_N_exittail labels visible above,
         * with %xmm1 = the %rsi block at index %rcx, %xmm3 = the leftover
         * %rdi bytes shifted down to byte 0 and %xmm0 = the matching null
         * mask.  Builds the final mask in %edx and falls through to L(exit).
         */
   1863 L(aftertail):
   1864 	pcmpeqb	%xmm3, %xmm1		/* 0xff where the tail bytes are equal */
   1865 	psubb	%xmm0, %xmm1		/* top bit stays set only for equal, non-null bytes */
   1866 	pmovmskb %xmm1, %edx
   1867 	not	%edx			/* set bits now mark a mismatch or a null byte */
   1868 
   1869 	.p2align 4
        /*
         * L(exit): %edx is a nonzero mask, %r9 the per-case byte position and
         * %rcx the load index.  L(less32bytes) is entered directly from the
         * first-block checks of the ashr_N cases and therefore skips the lea
         * that recomputes %rax.  Both paths advance %rdi and %rsi and swap
         * them when %r8d is nonzero.
         */
   1870 L(exit):
   1871 	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
   1872 L(less32bytes):
   1873 	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
   1874 	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
   1875 	test	%r8d, %r8d
   1876 	jz	L(ret)			/* flag clear: no swap needed */
   1877 	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
   1878 
   1879 	.p2align 4
        /*
         * Final step: turn the mask in %rdx into a byte index and return the
         * difference of the two bytes at that index, each zero-extended.
         * With USE_AS_STRNCMP the result is 0 (via L(strcmp_exitz)) when the
         * counter %r11 is not greater than that index.
         */
   1880 L(ret):
   1881 L(less16bytes):
   1882 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
   1883 
   1884 #ifdef USE_AS_STRNCMP
   1885 	sub	%rdx, %r11
   1886 	jbe	L(strcmp_exitz)		/* index is at or beyond the counter: return 0 */
   1887 #endif
   1888 	movzbl	(%rsi, %rdx), %ecx	/* byte from the second operand */
   1889 	movzbl	(%rdi, %rdx), %eax	/* byte from the first operand */
   1890 
   1891 	sub	%ecx, %eax		/* return value = first byte - second byte */
   1892 	ret
   1893 
   1894 L(strcmp_exitz):			/* return 0 */
   1895 	xor	%eax, %eax
   1896 	ret
   1897 
   1898 	.p2align 4
        /*
         * Return the difference of the bytes at (%rdi) and (%rsi), each
         * zero-extended.  The jumps that reach this label are outside this
         * section.
         */
   1899 L(Byte0):
   1900 	movzbl	(%rsi), %ecx
   1901 	movzbl	(%rdi), %eax
   1902 
   1903 	sub	%ecx, %eax
   1904 	ret
   1905 END (STRCMP)
   1906 
   1907 	.section .rodata,"a",@progbits
   1908 	.p2align 3
        /*
         * Table of 32-bit offsets, each relative to L(unaligned_table) itself,
         * to the ashr_1 .. ashr_15 handlers followed by ashr_0 as the last
         * entry.  NOTE(review): the code that indexes this table is outside
         * this section -- confirm how the index is derived.
         */
   1909 L(unaligned_table):
   1910 	.int	L(ashr_1) - L(unaligned_table)
   1911 	.int	L(ashr_2) - L(unaligned_table)
   1912 	.int	L(ashr_3) - L(unaligned_table)
   1913 	.int	L(ashr_4) - L(unaligned_table)
   1914 	.int	L(ashr_5) - L(unaligned_table)
   1915 	.int	L(ashr_6) - L(unaligned_table)
   1916 	.int	L(ashr_7) - L(unaligned_table)
   1917 	.int	L(ashr_8) - L(unaligned_table)
   1918 	.int	L(ashr_9) - L(unaligned_table)
   1919 	.int	L(ashr_10) - L(unaligned_table)
   1920 	.int	L(ashr_11) - L(unaligned_table)
   1921 	.int	L(ashr_12) - L(unaligned_table)
   1922 	.int	L(ashr_13) - L(unaligned_table)
   1923 	.int	L(ashr_14) - L(unaligned_table)
   1924 	.int	L(ashr_15) - L(unaligned_table)
   1925 	.int	L(ashr_0) - L(unaligned_table)
   1926