Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2011, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 /* Optimized strlcat with SSSE3 */
     32 
        /*
         * Helper macros.
         *
         * The cfi_* wrappers, ENTRY and END are only defined when nothing
         * earlier has defined them already (each is guarded by #ifndef).
         * Each cfi_* wrapper expands to the assembler directive of the same
         * name with a leading dot.
         */
     33 #ifndef cfi_startproc
     34 # define cfi_startproc	.cfi_startproc
     35 #endif
     36 
     37 #ifndef cfi_endproc
     38 # define cfi_endproc	.cfi_endproc
     39 #endif
     40 
     41 #ifndef cfi_rel_offset
     42 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     43 #endif
     44 
     45 #ifndef cfi_restore
     46 # define cfi_restore(reg)	.cfi_restore reg
     47 #endif
     48 
     49 #ifndef cfi_adjust_cfa_offset
     50 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     51 #endif
     52 
        /*
         * ENTRY(name): marks name as a global symbol of type @function,
         * aligns to 2^4 = 16 bytes, emits the label and opens a CFI
         * procedure.
         */
     53 #ifndef ENTRY
     54 # define ENTRY(name)	\
     55 	.type name,  @function;	\
     56 	.globl name;	\
     57 	.p2align 4;	\
     58 name:	\
     59 	cfi_startproc
     60 #endif
     61 
        /*
         * END(name): closes the CFI procedure and sets the size of name to
         * the distance from its label to this point.  name must therefore be
         * the symbol that ENTRY defined.
         */
     62 #ifndef END
     63 # define END(name)	\
     64 	cfi_endproc;	\
     65 	.size name, .-name
     66 #endif
     67 
        /*
         * CFI bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
         * and REG is recorded as saved at offset 0 from the current CFA
         * register.
         */
     68 #define CFI_PUSH(REG)	\
     69 	cfi_adjust_cfa_offset (4);	\
     70 	cfi_rel_offset (REG, 0)
     71 
        /* Inverse of CFI_PUSH: CFA offset shrinks by 4, REG is restored. */
     72 #define CFI_POP(REG)	\
     73 	cfi_adjust_cfa_offset (-4);	\
     74 	cfi_restore (REG)
     75 
        /*
         * PUSH/POP: the instruction plus the matching CFI bookkeeping.
         * L(label): local-label helper.  This first definition pastes
         * "Prolog_" in front of the label, giving .LProlog_<label>; it is
         * replaced by a plain .L<label> variant after the first #include.
         */
     76 #define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
     77 #define POP(REG)	popl	REG;	CFI_POP (REG)
     78 #define L(label)	.L##Prolog_##label
     79 
        /*
         * Offsets of the stack arguments from %esp.
         * DST (4) is used at function entry, before anything is pushed.
         * SRC (12) and LEN (16) are used only after %ebx has been pushed, so
         * they include those 4 bytes; after that push the first argument is
         * addressed as DST + 4.
         */
     80 #define DST	4
     81 #define SRC	DST+8
     82 #define LEN	SRC+4
     83 
        /*
         * strlcat_ssse3 -- function entry.
         *
         * Reads the argument at DST(%esp) into %edx, saves %ebx, then loads
         * the argument at LEN(%esp) into %ebx and subtracts 4 from it.
         * When LEN <= 4 (unsigned: the sub borrows or yields zero) control
         * goes to L(len_less4_prolog).  That label is not defined in the
         * visible file; presumably it comes from the included
         * sse2-strlen-atom.S -- TODO confirm.
         */
     84 	.text
     85 ENTRY (strlcat_ssse3)
     86 	mov	DST(%esp), %edx
     87 	PUSH	(%ebx)
     88 	mov	LEN(%esp), %ebx
     89 	sub	$4, %ebx
     90 	jbe	L(len_less4_prolog)
     91 
        /*
         * Settings for the include below:
         *  - RETURN becomes a jump to L(StrcpyStep), so code that uses RETURN
         *    continues in this function instead of returning to the caller.
         *  - every `edi` token is rewritten to `ebx`.  NOTE(review): %edi has
         *    not been saved at this point while %ebx has; presumably that is
         *    the reason -- confirm against the included file.
         *  - the USE_AS_* switches are consumed by the included file; their
         *    exact effect is defined there.
         */
     92 #define RETURN	jmp	L(StrcpyStep)
     93 #define edi	ebx
     94 
     95 #define USE_AS_STRNLEN
     96 #define USE_AS_STRCAT
     97 #define USE_AS_STRLCAT
     98 
     99 #include "sse2-strlen-atom.S"
    100 
    101 	.p2align 4
    102 L(StrcpyStep):
    103 
        /*
         * From here on:
         *  - the edi alias is removed,
         *  - L() produces plain .L<label> names,
         *  - RETURN  pops %ebx and returns,
         *  - RETURN1 pops %edi, then %ebx, and returns.
         * The CFI_PUSH directives after each `ret` emit no instructions; they
         * put the unwind state back to "register still on the stack" for the
         * code that follows the macro textually.
         */
    104 #undef edi
    105 #undef L
    106 #define L(label) .L##label
    107 #undef RETURN
    108 #define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx);
    109 #define RETURN1	POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
    110 
        /*
         * Copy-phase setup.
         *
         * NOTE(review): %eax is used here as the length of the string already
         * in the destination, as left by the included strlen code -- confirm
         * against sse2-strlen-atom.S.
         *
         *   %ecx = SRC argument, %ebx = LEN argument.
         * If LEN == %eax there is no room to append: only the source length
         * still has to be measured (L(CalculateLengthOfSrcProlog)).
         */
    111         movl	SRC(%esp), %ecx
    112 	movl	LEN(%esp), %ebx
    113 
    114 	cmp	%eax, %ebx
    115 	je	L(CalculateLengthOfSrcProlog)
        	/* %ebx = LEN - %eax: bytes still available in the destination. */
    116 	sub	%eax, %ebx
    117 
        	/* The difference is zero only when the cmp above was equal, so
        	   this branch is not taken on this path. */
    118 	test	%ebx, %ebx
    119 	jz	L(CalculateLengthOfSrcProlog)
    120 
        	/* %ebx is on the stack, hence DST + 4. */
    121 	mov	DST + 4(%esp), %edx
    122 
        	/*
        	 * %edx = dst + %eax (where appended bytes are written).
        	 * %edi = src - %eax, so that later "pointer - %edi" equals
        	 * %eax + (pointer - src), the value every L(ExitN) returns.
        	 */
    123 	PUSH	(%edi)
    124 	add	%eax, %edx
    125 	mov	%ecx, %edi
    126 	sub	%eax, %edi
    127 
        	/* 8 bytes of room or fewer: handled by the bounded byte-wise path. */
    128 	cmp	$8, %ebx
    129 	jbe	L(StrncpyExit8Bytes)
    130 
        	/* More than 8 bytes of room: a NUL at source offset k (k = 0..7)
        	   goes to L(Exit<k+1>). */
    131 	cmpb	$0, (%ecx)
    132 	jz	L(Exit1)
    133 	cmpb	$0, 1(%ecx)
    134 	jz	L(Exit2)
    135 	cmpb	$0, 2(%ecx)
    136 	jz	L(Exit3)
    137 	cmpb	$0, 3(%ecx)
    138 	jz	L(Exit4)
    139 	cmpb	$0, 4(%ecx)
    140 	jz	L(Exit5)
    141 	cmpb	$0, 5(%ecx)
    142 	jz	L(Exit6)
    143 	cmpb	$0, 6(%ecx)
    144 	jz	L(Exit7)
    145 	cmpb	$0, 7(%ecx)
    146 	jz	L(Exit8)
        	/* 9..15 bytes of room: continue on the bounded path for offsets 8..14. */
    147 	cmp	$16, %ebx
    148 	jb	L(StrncpyExit15Bytes)
        	/* At least 16 bytes of room: test source offsets 8..15. */
    149 	cmpb	$0, 8(%ecx)
    150 	jz	L(Exit9)
    151 	cmpb	$0, 9(%ecx)
    152 	jz	L(Exit10)
    153 	cmpb	$0, 10(%ecx)
    154 	jz	L(Exit11)
    155 	cmpb	$0, 11(%ecx)
    156 	jz	L(Exit12)
    157 	cmpb	$0, 12(%ecx)
    158 	jz	L(Exit13)
    159 	cmpb	$0, 13(%ecx)
    160 	jz	L(Exit14)
    161 	cmpb	$0, 14(%ecx)
    162 	jz	L(Exit15)
    163 	cmpb	$0, 15(%ecx)
    164 	jz	L(Exit16)
        	/* Exactly 16 bytes of room and no NUL in the first 16 source
        	   bytes: truncating copy. */
    165 	cmp	$16, %ebx
    166 	je	L(StrlcpyExit16)
    167 
        /* Otherwise execution continues in the included copy code, built with
           USE_AS_STRNCPY defined. */
    168 #define USE_AS_STRNCPY
    169 #include "ssse3-strcpy-atom.S"
    170 
        /*
         * L(CopyFrom1To16Bytes)
         *
         * Not branched to from the visible code; presumably a target of the
         * included ssse3-strcpy-atom.S -- TODO confirm.
         *
         * Adds %esi to both %edx and %ecx, pops %esi, then decodes %ax as a
         * 16-bit mask: the lowest set bit k selects L(Exit<k+1>).  %al covers
         * bits 0..7, %ah bits 8..15.  Each group of four is decoded with three
         * bit tests and a fall-through for the fourth bit.
         * (Presumably the mask marks NUL bytes found by pcmpeqb/pmovmskb in
         * the included code.)
         *
         * L(Exit4/8/12/16) copy that many bytes from (%ecx) to (%edx) and
         * return (%ecx + N - 1) - %edi via RETURN1.
         */
    171 	.p2align 4
    172 L(CopyFrom1To16Bytes):
    173 	add	%esi, %edx
    174 	add	%esi, %ecx
    175 
    176 	POP	(%esi)
    177 	test	%al, %al
    178 	jz	L(ExitHigh8)
    179 
        /* Also entered from L(CopyFrom1To16BytesCase2).  %ah is used as scratch
           here, so the upper mask byte is lost on this path. */
    180 L(CopyFrom1To16BytesLess8):
    181 	mov	%al, %ah
    182 	and	$15, %ah
    183 	jz	L(ExitHigh4)
    184 
    185 	test	$0x01, %al
    186 	jnz	L(Exit1)
    187 	test	$0x02, %al
    188 	jnz	L(Exit2)
    189 	test	$0x04, %al
    190 	jnz	L(Exit3)
    191 L(Exit4):
    192 	movl	(%ecx), %eax
    193 	movl	%eax, (%edx)
    194 
    195 	lea	3(%ecx), %eax
    196 	sub	%edi, %eax
    197 	RETURN1
    198 
    199 	.p2align 4
    200 L(ExitHigh4):
    201 	test	$0x10, %al
    202 	jnz	L(Exit5)
    203 	test	$0x20, %al
    204 	jnz	L(Exit6)
    205 	test	$0x40, %al
    206 	jnz	L(Exit7)
    207 L(Exit8):
    208 	movlpd	(%ecx), %xmm0
    209 	movlpd	%xmm0, (%edx)
    210 
    211 	lea	7(%ecx), %eax
    212 	sub	%edi, %eax
    213 	RETURN1
    214 
    215 	.p2align 4
    216 L(ExitHigh8):
        	/* Low mask byte is zero: decode %ah, using %al as scratch. */
    217 	mov	%ah, %al
    218 	and	$15, %al
    219 	jz	L(ExitHigh12)
    220 
    221 	test	$0x01, %ah
    222 	jnz	L(Exit9)
    223 	test	$0x02, %ah
    224 	jnz	L(Exit10)
    225 	test	$0x04, %ah
    226 	jnz	L(Exit11)
    227 L(Exit12):
    228 	movlpd	(%ecx), %xmm0
    229 	movlpd	%xmm0, (%edx)
    230 	movl	8(%ecx), %eax
    231 	movl	%eax, 8(%edx)
    232 
    233 	lea	11(%ecx), %eax
    234 	sub	%edi, %eax
    235 	RETURN1
    236 
    237 	.p2align 4
    238 L(ExitHigh12):
    239 	test	$0x10, %ah
    240 	jnz	L(Exit13)
    241 	test	$0x20, %ah
    242 	jnz	L(Exit14)
    243 	test	$0x40, %ah
    244 	jnz	L(Exit15)
    245 L(Exit16):
    246 	movlpd	(%ecx), %xmm0
    247 	movlpd	8(%ecx), %xmm1
    248 	movlpd	%xmm0, (%edx)
    249 	movlpd	%xmm1, 8(%edx)
    250 
    251 	lea	15(%ecx), %eax
    252 	sub	%edi, %eax
    253 	RETURN1
    254 
        /*
         * L(CopyFrom1To16BytesCase2) / L(ExitHighCase2)
         *
         * The CFI_PUSH below emits no instruction; it records that %esi is on
         * the stack for the code that follows (which pops it).
         *
         * Adds 16 to %ebx, adds %esi to %ecx and %edx and pops %esi.
         * NOTE(review): the +16 presumably undoes a pre-decrement of the
         * remaining-room counter made by the included copy loop -- confirm.
         *
         * Afterwards both limits are honoured at once: %ax is the 16-bit mask
         * (see L(CopyFrom1To16Bytes)) and %ebx the room left.  For N = 1..16
         * in order: mask bit N-1 set -> L(Exit<N>); otherwise %ebx == N ->
         * L(StrlcpyExit<N>) (truncate).
         */
    255 	CFI_PUSH(%esi)
    256 
    257 	.p2align 4
    258 L(CopyFrom1To16BytesCase2):
    259 	add	$16, %ebx
    260 	add	%esi, %ecx
    261 	add	%esi, %edx
    262 
    263 	POP	(%esi)
    264 
    265 	test	%al, %al
    266 	jz	L(ExitHighCase2)
    267 
        	/* A bit is set in the low mask byte and more than 8 bytes of
        	   room remain: the room limit cannot be hit first, so use the
        	   unbounded decode. */
    268 	cmp	$8, %ebx
    269 	ja	L(CopyFrom1To16BytesLess8)
    270 
    271 	test	$0x01, %al
    272 	jnz	L(Exit1)
    273 	cmp	$1, %ebx
    274 	je	L(StrlcpyExit1)
    275 	test	$0x02, %al
    276 	jnz	L(Exit2)
    277 	cmp	$2, %ebx
    278 	je	L(StrlcpyExit2)
    279 	test	$0x04, %al
    280 	jnz	L(Exit3)
    281 	cmp	$3, %ebx
    282 	je	L(StrlcpyExit3)
    283 	test	$0x08, %al
    284 	jnz	L(Exit4)
    285 	cmp	$4, %ebx
    286 	je	L(StrlcpyExit4)
    287 	test	$0x10, %al
    288 	jnz	L(Exit5)
    289 	cmp	$5, %ebx
    290 	je	L(StrlcpyExit5)
    291 	test	$0x20, %al
    292 	jnz	L(Exit6)
    293 	cmp	$6, %ebx
    294 	je	L(StrlcpyExit6)
    295 	test	$0x40, %al
    296 	jnz	L(Exit7)
    297 	cmp	$7, %ebx
    298 	je	L(StrlcpyExit7)
    299 	test	$0x80, %al
    300 	jnz	L(Exit8)
    301 	jmp	L(StrlcpyExit8)
    302 
    303 	.p2align 4
    304 L(ExitHighCase2):
        	/* No bit set in the low mask byte.  With 8 bytes of room or fewer
        	   the room limit decides alone. */
    305 	cmp	$8, %ebx
    306 	jbe	L(CopyFrom1To16BytesLess8Case3)
    307 
    308 	test	$0x01, %ah
    309 	jnz	L(Exit9)
    310 	cmp	$9, %ebx
    311 	je	L(StrlcpyExit9)
    312 	test	$0x02, %ah
    313 	jnz	L(Exit10)
    314 	cmp	$10, %ebx
    315 	je	L(StrlcpyExit10)
    316 	test	$0x04, %ah
    317 	jnz	L(Exit11)
    318 	cmp	$11, %ebx
    319 	je	L(StrlcpyExit11)
    320 	test	$0x8, %ah
    321 	jnz	L(Exit12)
    322 	cmp	$12, %ebx
    323 	je	L(StrlcpyExit12)
    324 	test	$0x10, %ah
    325 	jnz	L(Exit13)
    326 	cmp	$13, %ebx
    327 	je	L(StrlcpyExit13)
    328 	test	$0x20, %ah
    329 	jnz	L(Exit14)
    330 	cmp	$14, %ebx
    331 	je	L(StrlcpyExit14)
    332 	test	$0x40, %ah
    333 	jnz	L(Exit15)
    334 	cmp	$15, %ebx
    335 	je	L(StrlcpyExit15)
    336 	test	$0x80, %ah
    337 	jnz	L(Exit16)
    338 	jmp	L(StrlcpyExit16)
    339 
        /*
         * L(CopyFrom1To16BytesCase2OrCase3) / L(CopyFrom1To16BytesCase3)
         *
         * Case2OrCase3: a non-zero %eax goes to Case2; otherwise Case3.
         * Case3 performs the same pointer/counter fix-up as Case2 and then
         * dispatches on %ebx alone to L(StrlcpyExit<N>), i.e. the copy is
         * always truncated.
         *
         * Every L(StrlcpyExit<N>) in this file:
         *   - stores %bh at offset N-1 of the destination.  %bh is the second
         *     byte of %ebx; on the branches visible here %ebx was just
         *     compared against values no larger than 16, so that byte is 0
         *     and serves as the terminator (paths entered from the included
         *     code rely on the same bound -- TODO confirm),
         *   - copies the first N-1 source bytes (with overlapping moves where
         *     N-1 is not a power of two),
         *   - sets %edx = %ecx + N and %ecx = %edi, pops %edi and continues
         *     at L(CalculateLengthOfSrc) to measure the rest of the source.
         * The CFI_PUSH after each jmp emits no instruction; it restores the
         * "%edi is on the stack" unwind state for the label that follows.
         */
    340 	CFI_PUSH(%esi)
    341 
    342 	.p2align 4
    343 L(CopyFrom1To16BytesCase2OrCase3):
    344 	test	%eax, %eax
    345 	jnz	L(CopyFrom1To16BytesCase2)
    346 
    347 	.p2align 4
    348 L(CopyFrom1To16BytesCase3):
    349 	add	$16, %ebx
    350 	add	%esi, %edx
    351 	add	%esi, %ecx
    352 
    353 	POP	(%esi)
    354 
    355 	cmp	$8, %ebx
    356 	ja	L(ExitHigh8Case3)
    357 
        /* Also entered from L(ExitHighCase2). */
    358 L(CopyFrom1To16BytesLess8Case3):
    359 	cmp	$4, %ebx
    360 	ja	L(ExitHigh4Case3)
    361 
    362 	cmp	$1, %ebx
    363 	je	L(StrlcpyExit1)
    364 	cmp	$2, %ebx
    365 	je	L(StrlcpyExit2)
    366 	cmp	$3, %ebx
    367 	je	L(StrlcpyExit3)
        /* 3 source bytes + terminator. */
    368 L(StrlcpyExit4):
    369 	movb	%bh, 3(%edx)
    370 	movw	(%ecx), %ax
    371 	movw	%ax, (%edx)
    372 	movb	2(%ecx), %al
    373 	movb	%al, 2(%edx)
    374 
    375 	lea	4(%ecx), %edx
    376 	mov	%edi, %ecx
    377 	POP	(%edi)
    378 	jmp	L(CalculateLengthOfSrc)
    379         CFI_PUSH     (%edi)
    380 
    381 	.p2align 4
    382 L(ExitHigh4Case3):
    383 	cmp	$5, %ebx
    384 	je	L(StrlcpyExit5)
    385 	cmp	$6, %ebx
    386 	je	L(StrlcpyExit6)
    387 	cmp	$7, %ebx
    388 	je	L(StrlcpyExit7)
        /* 7 source bytes (two overlapping 4-byte moves) + terminator. */
    389 L(StrlcpyExit8):
    390 	movb	%bh, 7(%edx)
    391 	movl	(%ecx), %eax
    392 	movl	%eax, (%edx)
    393 	movl	3(%ecx), %eax
    394 	movl	%eax, 3(%edx)
    395 
    396 	lea	8(%ecx), %edx
    397 	mov	%edi, %ecx
    398 	POP	(%edi)
    399 	jmp	L(CalculateLengthOfSrc)
    400         CFI_PUSH     (%edi)
    401 
    402 	.p2align 4
    403 L(ExitHigh8Case3):
    404 	cmp	$12, %ebx
    405 	ja	L(ExitHigh12Case3)
    406 
    407 	cmp	$9, %ebx
    408 	je	L(StrlcpyExit9)
    409 	cmp	$10, %ebx
    410 	je	L(StrlcpyExit10)
    411 	cmp	$11, %ebx
    412 	je	L(StrlcpyExit11)
        /* 11 source bytes (8 + overlapping 4) + terminator. */
    413 L(StrlcpyExit12):
    414 	movb	%bh, 11(%edx)
    415 	movlpd	(%ecx), %xmm0
    416 	movlpd	%xmm0, (%edx)
    417 	movl	7(%ecx), %eax
    418 	movl	%eax, 7(%edx)
    419 
    420 	lea	12(%ecx), %edx
    421 	mov	%edi, %ecx
    422 	POP	(%edi)
    423 	jmp	L(CalculateLengthOfSrc)
    424         CFI_PUSH     (%edi)
    425 
    426 	.p2align 4
    427 L(ExitHigh12Case3):
    428 	cmp	$13, %ebx
    429 	je	L(StrlcpyExit13)
    430 	cmp	$14, %ebx
    431 	je	L(StrlcpyExit14)
    432 	cmp	$15, %ebx
    433 	je	L(StrlcpyExit15)
        /* 15 source bytes (two overlapping 8-byte moves) + terminator. */
    434 L(StrlcpyExit16):
    435 	movb	%bh, 15(%edx)
    436 	movlpd	(%ecx), %xmm0
    437 	movlpd	%xmm0, (%edx)
    438 	movlpd	7(%ecx), %xmm0
    439 	movlpd	%xmm0, 7(%edx)
    440 
    441 	lea	16(%ecx), %edx
    442 	mov	%edi, %ecx
    443 	POP	(%edi)
    444 	jmp	L(CalculateLengthOfSrc)
    445         CFI_PUSH     (%edi)
    446 
        /*
         * Tail handlers for 1..7 bytes (4 is L(Exit4)/L(StrlcpyExit4) above).
         *
         * L(Exit<N>):        copies N bytes from (%ecx) to (%edx) and returns
         *                    (%ecx + N - 1) - %edi through RETURN1.  When
         *                    reached from a "cmpb $0, N-1(%ecx)" test the
         *                    N-th byte copied is the NUL.
         * L(StrlcpyExit<N>): truncating variant: writes %bh (zero on the
         *                    paths visible in this file) at offset N-1,
         *                    copies N-1 source bytes, then sets
         *                    %edx = %ecx + N, %ecx = %edi, pops %edi and
         *                    jumps to L(CalculateLengthOfSrc).
         * The CFI_PUSH after each jmp is a directive only (unwind state for
         * the next label).
         */
    447 	.p2align 4
    448 L(StrlcpyExit1):
    449 	movb	%bh, (%edx)
    450 
    451 	lea	1(%ecx), %edx
    452 	mov	%edi, %ecx
    453 	POP	(%edi)
    454 	jmp	L(CalculateLengthOfSrc)
    455         CFI_PUSH     (%edi)
    456 
    457 	.p2align 4
    458 L(Exit1):
    459 	movb	(%ecx), %al
    460 	movb	%al, (%edx)
    461 
    462 	mov	%ecx, %eax
    463 	sub	%edi, %eax
    464 	RETURN1
    465 
    466 	.p2align 4
    467 L(StrlcpyExit2):
    468 	movb	%bh, 1(%edx)
    469 	movb	(%ecx), %al
    470 	movb	%al, (%edx)
    471 
    472 	lea	2(%ecx), %edx
    473 	mov	%edi, %ecx
    474 	POP	(%edi)
    475 	jmp	L(CalculateLengthOfSrc)
    476         CFI_PUSH     (%edi)
    477 
    478 	.p2align 4
    479 L(Exit2):
    480 	movw	(%ecx), %ax
    481 	movw	%ax, (%edx)
        	/* NOTE(review): this value is overwritten by the lea below
        	   before it is read. */
    482 	movl	%edi, %eax
    483 
    484 	lea	1(%ecx), %eax
    485 	sub	%edi, %eax
    486 	RETURN1
    487 
    488 	.p2align 4
    489 L(StrlcpyExit3):
    490 	movb	%bh, 2(%edx)
    491 	movw	(%ecx), %ax
    492 	movw	%ax, (%edx)
    493 
    494 	lea	3(%ecx), %edx
    495 	mov	%edi, %ecx
    496 	POP	(%edi)
    497 	jmp	L(CalculateLengthOfSrc)
    498         CFI_PUSH     (%edi)
    499 
    500 	.p2align 4
    501 L(Exit3):
    502 	movw	(%ecx), %ax
    503 	movw	%ax, (%edx)
    504 	movb	2(%ecx), %al
    505 	movb	%al, 2(%edx)
    506 
    507 	lea	2(%ecx), %eax
    508 	sub	%edi, %eax
    509 	RETURN1
    510 
    511 	.p2align 4
    512 L(StrlcpyExit5):
    513 	movb	%bh, 4(%edx)
    514 	movl	(%ecx), %eax
    515 	movl	%eax, (%edx)
        	/* NOTE(review): L(CalculateLengthOfSrc) writes %eax before
        	   reading it on every path visible in this file, so this
        	   value appears to be unused. */
    516 	movl	%edi, %eax
    517 
    518 	lea	5(%ecx), %edx
    519 	mov	%edi, %ecx
    520 	POP	(%edi)
    521 	jmp	L(CalculateLengthOfSrc)
    522         CFI_PUSH     (%edi)
    523 
    524 	.p2align 4
    525 L(Exit5):
    526 	movl	(%ecx), %eax
    527 	movl	%eax, (%edx)
    528 	movb	4(%ecx), %al
    529 	movb	%al, 4(%edx)
    530 
    531 	lea	4(%ecx), %eax
    532 	sub	%edi, %eax
    533 	RETURN1
    534 
    535 	.p2align 4
    536 L(StrlcpyExit6):
    537 	movb	%bh, 5(%edx)
    538 	movl	(%ecx), %eax
    539 	movl	%eax, (%edx)
    540 	movb	4(%ecx), %al
    541 	movb	%al, 4(%edx)
    542 
    543 	lea	6(%ecx), %edx
    544 	mov	%edi, %ecx
    545 	POP	(%edi)
    546 	jmp	L(CalculateLengthOfSrc)
    547         CFI_PUSH     (%edi)
    548 
    549 	.p2align 4
    550 L(Exit6):
    551 	movl	(%ecx), %eax
    552 	movl	%eax, (%edx)
    553 	movw	4(%ecx), %ax
    554 	movw	%ax, 4(%edx)
    555 
    556 	lea	5(%ecx), %eax
    557 	sub	%edi, %eax
    558 	RETURN1
    559 
    560 	.p2align 4
    561 L(StrlcpyExit7):
    562 	movb	%bh, 6(%edx)
    563 	movl	(%ecx), %eax
    564 	movl	%eax, (%edx)
    565 	movw	4(%ecx), %ax
    566 	movw	%ax, 4(%edx)
    567 
    568 	lea	7(%ecx), %edx
    569 	mov	%edi, %ecx
    570 	POP	(%edi)
    571 	jmp	L(CalculateLengthOfSrc)
    572         CFI_PUSH     (%edi)
    573 
    574 	.p2align 4
    575 L(Exit7):
        	/* Two overlapping 4-byte moves cover bytes 0..6. */
    576 	movl	(%ecx), %eax
    577 	movl	%eax, (%edx)
    578 	movl	3(%ecx), %eax
    579 	movl	%eax, 3(%edx)
    580 
    581 	lea	6(%ecx), %eax
    582 	sub	%edi, %eax
    583 	RETURN1
    583 
        /*
         * Tail handlers for 9..15 bytes (12 and 16 are defined earlier).
         *
         * Same two families as for the shorter tails:
         *   L(Exit<N>)        copies N bytes and returns
         *                     (%ecx + N - 1) - %edi via RETURN1;
         *   L(StrlcpyExit<N>) stores %bh (zero on the paths visible in this
         *                     file) at offset N-1, copies N-1 bytes, then
         *                     hands over to L(CalculateLengthOfSrc) with
         *                     %edx = %ecx + N and %ecx = %edi after popping
         *                     %edi.
         * All of them start with an 8-byte movlpd of bytes 0..7; the
         * remainder is one more move, overlapping the first where needed.
         */
    584 	.p2align 4
    585 L(StrlcpyExit9):
    586 	movb	%bh, 8(%edx)
    587 	movlpd	(%ecx), %xmm0
    588 	movlpd	%xmm0, (%edx)
    589 
    590 	lea	9(%ecx), %edx
    591 	mov	%edi, %ecx
    592 	POP	(%edi)
    593 	jmp	L(CalculateLengthOfSrc)
    594         CFI_PUSH     (%edi)
    595 
    596 	.p2align 4
    597 L(Exit9):
    598 	movlpd	(%ecx), %xmm0
    599 	movlpd	%xmm0, (%edx)
    600 	movb	8(%ecx), %al
    601 	movb	%al, 8(%edx)
    602 
    603 	lea	8(%ecx), %eax
    604 	sub	%edi, %eax
    605 	RETURN1
    606 
    607 	.p2align 4
    608 L(StrlcpyExit10):
    609 	movb	%bh, 9(%edx)
    610 	movlpd	(%ecx), %xmm0
    611 	movlpd	%xmm0, (%edx)
    612 	movb	8(%ecx), %al
    613 	movb	%al, 8(%edx)
    614 
    615 	lea	10(%ecx), %edx
    616 	mov	%edi, %ecx
    617 	POP	(%edi)
    618 	jmp	L(CalculateLengthOfSrc)
    619         CFI_PUSH     (%edi)
    620 
    621 	.p2align 4
    622 L(Exit10):
    623 	movlpd	(%ecx), %xmm0
    624 	movlpd	%xmm0, (%edx)
    625 	movw	8(%ecx), %ax
    626 	movw	%ax, 8(%edx)
    627 
    628 	lea	9(%ecx), %eax
    629 	sub	%edi, %eax
    630 	RETURN1
    631 
    632 	.p2align 4
    633 L(StrlcpyExit11):
    634 	movb	%bh, 10(%edx)
    635 	movlpd	(%ecx), %xmm0
    636 	movlpd	%xmm0, (%edx)
    637 	movw	8(%ecx), %ax
    638 	movw	%ax, 8(%edx)
    639 
    640 	lea	11(%ecx), %edx
    641 	mov	%edi, %ecx
    642 	POP	(%edi)
    643 	jmp	L(CalculateLengthOfSrc)
    644         CFI_PUSH     (%edi)
    645 
    646 	.p2align 4
    647 L(Exit11):
        	/* Bytes 0..7, then bytes 7..10 (overlapping by one). */
    648 	movlpd	(%ecx), %xmm0
    649 	movlpd	%xmm0, (%edx)
    650 	movl	7(%ecx), %eax
    651 	movl	%eax, 7(%edx)
    652 
    653 	lea	10(%ecx), %eax
    654 	sub	%edi, %eax
    655 	RETURN1
    656 
    657 	.p2align 4
    658 L(StrlcpyExit13):
    659 	movb	%bh, 12(%edx)
    660 	movlpd	(%ecx), %xmm0
    661 	movlpd	%xmm0, (%edx)
    662 	movl	8(%ecx), %eax
    663 	movl	%eax, 8(%edx)
    664 
    665 	lea	13(%ecx), %edx
    666 	mov	%edi, %ecx
    667 	POP	(%edi)
    668 	jmp	L(CalculateLengthOfSrc)
    669         CFI_PUSH     (%edi)
    670 
    671 	.p2align 4
    672 L(Exit13):
        	/* Bytes 0..7, then bytes 5..12. */
    673 	movlpd	(%ecx), %xmm0
    674 	movlpd	%xmm0, (%edx)
    675 	movlpd	5(%ecx), %xmm0
    676 	movlpd	%xmm0, 5(%edx)
    677 
    678 	lea	12(%ecx), %eax
    679 	sub	%edi, %eax
    680 	RETURN1
    681 
    682 	.p2align 4
    683 L(StrlcpyExit14):
    684 	movb	%bh, 13(%edx)
    685 	movlpd	(%ecx), %xmm0
    686 	movlpd	%xmm0, (%edx)
    687 	movlpd	5(%ecx), %xmm0
    688 	movlpd	%xmm0, 5(%edx)
    689 
    690 	lea	14(%ecx), %edx
    691 	mov	%edi, %ecx
    692 	POP	(%edi)
    693 	jmp	L(CalculateLengthOfSrc)
    694         CFI_PUSH     (%edi)
    695 
    696 	.p2align 4
    697 L(Exit14):
        	/* Bytes 0..7, then bytes 6..13. */
    698 	movlpd	(%ecx), %xmm0
    699 	movlpd	%xmm0, (%edx)
    700 	movlpd	6(%ecx), %xmm0
    701 	movlpd	%xmm0, 6(%edx)
    702 
    703 	lea	13(%ecx), %eax
    704 	sub	%edi, %eax
    705 	RETURN1
    706 
    707 	.p2align 4
    708 L(StrlcpyExit15):
    709 	movb	%bh, 14(%edx)
    710 	movlpd	(%ecx), %xmm0
    711 	movlpd	%xmm0, (%edx)
    712 	movlpd	6(%ecx), %xmm0
    713 	movlpd	%xmm0, 6(%edx)
    714 
    715 	lea	15(%ecx), %edx
    716 	mov	%edi, %ecx
    717 	POP	(%edi)
    718 	jmp	L(CalculateLengthOfSrc)
    719         CFI_PUSH     (%edi)
    720 
    721 	.p2align 4
    722 L(Exit15):
        	/* Bytes 0..7, then bytes 7..14. */
    723 	movlpd	(%ecx), %xmm0
    724 	movlpd	%xmm0, (%edx)
    725 	movlpd	7(%ecx), %xmm0
    726 	movlpd	%xmm0, 7(%edx)
    727 
    728 	lea	14(%ecx), %eax
    729 	sub	%edi, %eax
    730 	RETURN1
    732 
        /*
         * Bounded byte-wise paths used when fewer than 16 bytes of room
         * remain (%ebx) at the start of the copy phase.
         *
         * L(StrncpyExit15Bytes): entered with 9 <= %ebx <= 15 after source
         *     offsets 0..7 were found non-zero.
         * L(StrncpyExit8Bytes):  entered with %ebx <= 8, nothing tested yet.
         *
         * Each is split in two halves on %ebx (<= 12 / > 12 and <= 4 / > 4).
         * In the upper half the room limit cannot be reached within the first
         * four offsets, so those are tested without a room check.  Elsewhere
         * every offset k is handled as:
         *     source byte k is NUL   -> L(Exit<k+1>)
         *     else %ebx == k+1       -> L(StrlcpyExit<k+1>)
         * and the last offset of each half ends in an unconditional jump to
         * the truncating exit.
         */
    733 	.p2align 4
    734 L(StrncpyExit15Bytes):
    735 	cmp	$12, %ebx
    736 	ja	L(StrncpyExit15Bytes1)
    737 
    738 	cmpb	$0, 8(%ecx)
    739 	jz	L(Exit9)
    740 	cmp	$9, %ebx
    741 	je	L(StrlcpyExit9)
    742 
    743 	cmpb	$0, 9(%ecx)
    744 	jz	L(Exit10)
    745 	cmp	$10, %ebx
    746 	je	L(StrlcpyExit10)
    747 
    748 	cmpb	$0, 10(%ecx)
    749 	jz	L(Exit11)
    750 	cmp	$11, %ebx
    751 	je	L(StrlcpyExit11)
    752 
    753 	cmpb	$0, 11(%ecx)
    754 	jz	L(Exit12)
    755 	jmp	L(StrlcpyExit12)
    756 
    757 	.p2align 4
    758 L(StrncpyExit15Bytes1):
        	/* 13 <= %ebx <= 15 on the visible entry path. */
    759 	cmpb	$0, 8(%ecx)
    760 	jz	L(Exit9)
    761 	cmpb	$0, 9(%ecx)
    762 	jz	L(Exit10)
    763 	cmpb	$0, 10(%ecx)
    764 	jz	L(Exit11)
    765 	cmpb	$0, 11(%ecx)
    766 	jz	L(Exit12)
    767 
    768 	cmpb	$0, 12(%ecx)
    769 	jz	L(Exit13)
    770 	cmp	$13, %ebx
    771 	je	L(StrlcpyExit13)
    772 
    773 	cmpb	$0, 13(%ecx)
    774 	jz	L(Exit14)
    775 	cmp	$14, %ebx
    776 	je	L(StrlcpyExit14)
    777 
    778 	cmpb	$0, 14(%ecx)
    779 	jz	L(Exit15)
    780 	jmp	L(StrlcpyExit15)
    781 
    782 	.p2align 4
    783 L(StrncpyExit8Bytes):
    784 	cmp	$4, %ebx
    785 	ja	L(StrncpyExit8Bytes1)
    786 
    787 	cmpb	$0, (%ecx)
    788 	jz	L(Exit1)
    789 	cmp	$1, %ebx
    790 	je	L(StrlcpyExit1)
    791 
    792 	cmpb	$0, 1(%ecx)
    793 	jz	L(Exit2)
    794 	cmp	$2, %ebx
    795 	je	L(StrlcpyExit2)
    796 
    797 	cmpb	$0, 2(%ecx)
    798 	jz	L(Exit3)
    799 	cmp	$3, %ebx
    800 	je	L(StrlcpyExit3)
    801 
    802 	cmpb	$0, 3(%ecx)
    803 	jz	L(Exit4)
    804 	jmp	L(StrlcpyExit4)
    805 
    806 	.p2align 4
    807 L(StrncpyExit8Bytes1):
        	/* 5 <= %ebx <= 8 on the visible entry path. */
    808 	cmpb	$0, (%ecx)
    809 	jz	L(Exit1)
    810 	cmpb	$0, 1(%ecx)
    811 	jz	L(Exit2)
    812 	cmpb	$0, 2(%ecx)
    813 	jz	L(Exit3)
    814 	cmpb	$0, 3(%ecx)
    815 	jz	L(Exit4)
    816 
    817 	cmpb	$0, 4(%ecx)
    818 	jz	L(Exit5)
    819 	cmp	$5, %ebx
    820 	je	L(StrlcpyExit5)
    821 
    822 	cmpb	$0, 5(%ecx)
    823 	jz	L(Exit6)
    824 	cmp	$6, %ebx
    825 	je	L(StrlcpyExit6)
    826 
    827 	cmpb	$0, 6(%ecx)
    828 	jz	L(Exit7)
    829 	cmp	$7, %ebx
    830 	je	L(StrlcpyExit7)
    831 
    832 	cmpb	$0, 7(%ecx)
    833 	jz	L(Exit8)
    834 	jmp	L(StrlcpyExit8)
    835 
        /*
         * Source-length measurement (no more bytes are written from here on).
         *
         * CFI_POP below emits no instruction: from this point only %ebx is
         * on the stack, matching RETURN (which pops %ebx only).
         *
         * L(Prolog_return_start_len): with the current L() this is the symbol
         *     .LProlog_return_start_len, the same name L(return_start_len)
         *     produced while the Prolog_ variant of L() was active.
         *     Presumably the strlen include branches here -- TODO confirm.
         *     Reloads %ebx = LEN and %ecx = SRC.
         * L(CalculateLengthOfSrcProlog): %edx = source pointer,
         *     %ecx = source pointer - %ebx, so that "address - %ecx" equals
         *     %ebx + (address - source).
         * L(CalculateLengthOfSrc): %edx = address to scan from, %ecx = bias
         *     subtracted from the address of the NUL to form the result.
         *     Tests the 16 bytes at %edx one by one; a NUL at offset k goes
         *     to L(exit_tail<k>).
         */
    836 	CFI_POP	(%edi)
    837 
    838 
    839 	.p2align 4
    840 L(Prolog_return_start_len):
    841 	movl	LEN(%esp), %ebx
    842         movl	SRC(%esp), %ecx
    843 L(CalculateLengthOfSrcProlog):
    844 	mov	%ecx, %edx
    845 	sub	%ebx, %ecx
    846 
    847 	.p2align 4
    848 L(CalculateLengthOfSrc):
    849 	cmpb	$0, (%edx)
    850 	jz	L(exit_tail0)
    851 	cmpb	$0, 1(%edx)
    852 	jz	L(exit_tail1)
    853 	cmpb	$0, 2(%edx)
    854 	jz	L(exit_tail2)
    855 	cmpb	$0, 3(%edx)
    856 	jz	L(exit_tail3)
    857 
    858 	cmpb	$0, 4(%edx)
    859 	jz	L(exit_tail4)
    860 	cmpb	$0, 5(%edx)
    861 	jz	L(exit_tail5)
    862 	cmpb	$0, 6(%edx)
    863 	jz	L(exit_tail6)
    864 	cmpb	$0, 7(%edx)
    865 	jz	L(exit_tail7)
    866 
    867 	cmpb	$0, 8(%edx)
    868 	jz	L(exit_tail8)
    869 	cmpb	$0, 9(%edx)
    870 	jz	L(exit_tail9)
    871 	cmpb	$0, 10(%edx)
    872 	jz	L(exit_tail10)
    873 	cmpb	$0, 11(%edx)
    874 	jz	L(exit_tail11)
    875 
    876 	cmpb	$0, 12(%edx)
    877 	jz	L(exit_tail12)
    878 	cmpb	$0, 13(%edx)
    879 	jz	L(exit_tail13)
    880 	cmpb	$0, 14(%edx)
    881 	jz	L(exit_tail14)
    882 	cmpb	$0, 15(%edx)
    883 	jz	L(exit_tail15)
    884 
        /*
         * Vector part of the source-length scan (falls through from the 16
         * single-byte tests above).
         *
         *   %eax = (%edx + 16) & -16 : first 16-byte-aligned address above
         *          %edx; every later memory operand is therefore aligned.
         *   %ecx += 16 : L(exit) is always reached with %eax already advanced
         *          16 bytes past the chunk whose mask is in %edx, and computes
         *          %eax - %ecx; the extra 16 cancels that advance.
         *
         * One step = pcmpeqb of an all-zero register with 16 bytes of memory
         * (byte becomes 0xff where the memory byte is 0), pmovmskb of the
         * result into %edx, advance %eax, branch to L(exit) if %edx != 0.
         * A register whose compare found nothing is all-zero again, which is
         * why xmm0..xmm3 are reused after the first round without a new pxor.
         * 16 such steps are unrolled before the 64-byte loop.
         */
    885 	pxor	%xmm0, %xmm0
    886 	lea	16(%edx), %eax
    887 	add	$16, %ecx
    888 	and	$-16, %eax
    889 
    890 	pcmpeqb	(%eax), %xmm0
    891 	pmovmskb %xmm0, %edx
    892 	pxor	%xmm1, %xmm1
    893 	lea	16(%eax), %eax
    894 	test	%edx, %edx
    895 	jnz	L(exit)
    896 
    897 	pcmpeqb	(%eax), %xmm1
    898 	pmovmskb %xmm1, %edx
    899 	pxor	%xmm2, %xmm2
    900 	lea	16(%eax), %eax
    901 	test	%edx, %edx
    902 	jnz	L(exit)
    903 
    904 	pcmpeqb	(%eax), %xmm2
    905 	pmovmskb %xmm2, %edx
    906 	pxor	%xmm3, %xmm3
    907 	lea	16(%eax), %eax
    908 	test	%edx, %edx
    909 	jnz	L(exit)
    910 
    911 	pcmpeqb	(%eax), %xmm3
    912 	pmovmskb %xmm3, %edx
    913 	lea	16(%eax), %eax
    914 	test	%edx, %edx
    915 	jnz	L(exit)
    916 
    917 	pcmpeqb	(%eax), %xmm0
    918 	pmovmskb %xmm0, %edx
    919 	lea	16(%eax), %eax
    920 	test	%edx, %edx
    921 	jnz	L(exit)
    922 
    923 	pcmpeqb	(%eax), %xmm1
    924 	pmovmskb %xmm1, %edx
    925 	lea	16(%eax), %eax
    926 	test	%edx, %edx
    927 	jnz	L(exit)
    928 
    929 	pcmpeqb	(%eax), %xmm2
    930 	pmovmskb %xmm2, %edx
    931 	lea	16(%eax), %eax
    932 	test	%edx, %edx
    933 	jnz	L(exit)
    934 
    935 	pcmpeqb	(%eax), %xmm3
    936 	pmovmskb %xmm3, %edx
    937 	lea	16(%eax), %eax
    938 	test	%edx, %edx
    939 	jnz	L(exit)
    940 
    941 	pcmpeqb	(%eax), %xmm0
    942 	pmovmskb %xmm0, %edx
    943 	lea	16(%eax), %eax
    944 	test	%edx, %edx
    945 	jnz	L(exit)
    946 
    947 	pcmpeqb	(%eax), %xmm1
    948 	pmovmskb %xmm1, %edx
    949 	lea	16(%eax), %eax
    950 	test	%edx, %edx
    951 	jnz	L(exit)
    952 
    953 	pcmpeqb	(%eax), %xmm2
    954 	pmovmskb %xmm2, %edx
    955 	lea	16(%eax), %eax
    956 	test	%edx, %edx
    957 	jnz	L(exit)
    958 
    959 	pcmpeqb	(%eax), %xmm3
    960 	pmovmskb %xmm3, %edx
    961 	lea	16(%eax), %eax
    962 	test	%edx, %edx
    963 	jnz	L(exit)
    964 
    965 	pcmpeqb	(%eax), %xmm0
    966 	pmovmskb %xmm0, %edx
    967 	lea	16(%eax), %eax
    968 	test	%edx, %edx
    969 	jnz	L(exit)
    970 
    971 	pcmpeqb	(%eax), %xmm1
    972 	pmovmskb %xmm1, %edx
    973 	lea	16(%eax), %eax
    974 	test	%edx, %edx
    975 	jnz	L(exit)
    976 
    977 	pcmpeqb	(%eax), %xmm2
    978 	pmovmskb %xmm2, %edx
    979 	lea	16(%eax), %eax
    980 	test	%edx, %edx
    981 	jnz	L(exit)
    982 
    983 	pcmpeqb	(%eax), %xmm3
    984 	pmovmskb %xmm3, %edx
    985 	lea	16(%eax), %eax
    986 	test	%edx, %edx
    987 	jnz	L(exit)
    988 
        	/* Round %eax down to a 64-byte boundary for the movaps loop.
        	   This can step back over bytes that were already tested. */
    989 	and	$-0x40, %eax
    990 
        	/*
        	 * 64 bytes per iteration: the byte-wise minimum of the four
        	 * 16-byte chunks contains a zero byte iff any chunk does.
        	 * %xmm3 is all-zero here (its last compare found nothing) and
        	 * serves as the comparand.  %xmm1 and %xmm6 keep chunks 1 and 3
        	 * for the decode after the loop; %xmm0 and %xmm2 are overwritten
        	 * by pminub.
        	 */
    990 	.p2align 4
    991 L(aligned_64_loop):
    992 	movaps	(%eax), %xmm0
    993 	movaps	16(%eax), %xmm1
    994 	movaps	32(%eax), %xmm2
    995 	movaps	48(%eax), %xmm6
    996 	pminub	%xmm1, %xmm0
    997 	pminub	%xmm6, %xmm2
    998 	pminub	%xmm0, %xmm2
    999 	pcmpeqb	%xmm3, %xmm2
   1000 	pmovmskb %xmm2, %edx
   1001 	lea	64(%eax), %eax
   1002 	test	%edx, %edx
   1003 	jz	L(aligned_64_loop)
   1004 
        	/*
        	 * A zero byte lies in the 64 bytes at %eax - 64.  Test the four
        	 * chunks in order.  %ecx is raised by 48 for chunk 0 and lowered
        	 * by 16 for each later chunk, so that %eax - %ecx in L(exit)
        	 * again gives the biased offset of the chunk whose mask is in
        	 * %edx.  After the fourth compare control falls into L(exit).
        	 */
   1005 	pcmpeqb	-64(%eax), %xmm3
   1006 	pmovmskb %xmm3, %edx
   1007 	lea	48(%ecx), %ecx
   1008 	test	%edx, %edx
   1009 	jnz	L(exit)
   1010 
   1011 	pcmpeqb	%xmm1, %xmm3
   1012 	pmovmskb %xmm3, %edx
   1013 	lea	-16(%ecx), %ecx
   1014 	test	%edx, %edx
   1015 	jnz	L(exit)
   1016 
   1017 	pcmpeqb	-32(%eax), %xmm3
   1018 	pmovmskb %xmm3, %edx
   1019 	lea	-16(%ecx), %ecx
   1020 	test	%edx, %edx
   1021 	jnz	L(exit)
   1022 
   1023 	pcmpeqb	%xmm6, %xmm3
   1024 	pmovmskb %xmm3, %edx
   1025 	lea	-16(%ecx), %ecx
   1026 
   1028 	.p2align 4
        /*
         * L(exit): final decode of the vector scan.
         *
         *   in:  %edx = 16-bit mask from pmovmskb (bit k set = byte k of the
         *               chunk is zero), %eax / %ecx as described at the scan.
         *   %eax = %eax - %ecx, then the index of the lowest set mask bit is
         *   added and the function returns through RETURN (pop %ebx; ret).
         *
         * %dl holds bits 0..7 and %dh bits 8..15.  Each nibble is decoded
         * with three bit tests; the fourth position is handled by the
         * fall-through "add $3/$7/$11/$15".  %cl / %ch are scratch.
         * L(exit_3), L(exit_7), L(exit_11) and L(exit_15) are not branched to
         * from the code visible in this file.
         */
   1029 L(exit):
   1030 	sub	%ecx, %eax
   1031 	test	%dl, %dl
   1032 	jz	L(exit_more_8)
   1033 
   1034 	mov	%dl, %cl
   1035 	and	$15, %cl
   1036 	jz	L(exit_more_4)
   1037 	test	$0x01, %dl
   1038 	jnz	L(exit_0)
   1039 	test	$0x02, %dl
   1040 	jnz	L(exit_1)
   1041 	test	$0x04, %dl
   1042 	jnz	L(exit_2)
   1043 	add	$3, %eax
   1044 	RETURN
   1045 
   1046 	.p2align 4
   1047 L(exit_more_4):
   1048 	test	$0x10, %dl
   1049 	jnz	L(exit_4)
   1050 	test	$0x20, %dl
   1051 	jnz	L(exit_5)
   1052 	test	$0x40, %dl
   1053 	jnz	L(exit_6)
   1054 	add	$7, %eax
   1055 	RETURN
   1056 
   1057 	.p2align 4
   1058 L(exit_more_8):
   1059 	mov	%dh, %ch
   1060 	and	$15, %ch
   1061 	jz	L(exit_more_12)
   1062 	test	$0x01, %dh
   1063 	jnz	L(exit_8)
   1064 	test	$0x02, %dh
   1065 	jnz	L(exit_9)
   1066 	test	$0x04, %dh
   1067 	jnz	L(exit_10)
   1068 	add	$11, %eax
   1069 	RETURN
   1070 
   1071 	.p2align 4
   1072 L(exit_more_12):
   1073 	test	$0x10, %dh
   1074 	jnz	L(exit_12)
   1075 	test	$0x20, %dh
   1076 	jnz	L(exit_13)
   1077 	test	$0x40, %dh
   1078 	jnz	L(exit_14)
   1079 	add	$15, %eax
        /* L(exit_0) adds nothing; it shares the RETURN of the +15 path above. */
   1080 L(exit_0):
   1081 	RETURN
   1082 
        /* L(exit_<k>): add k to %eax and return. */
   1083 	.p2align 4
   1084 L(exit_1):
   1085 	add	$1, %eax
   1086 	RETURN
   1087 
   1088 L(exit_2):
   1089 	add	$2, %eax
   1090 	RETURN
   1091 
   1092 L(exit_3):
   1093 	add	$3, %eax
   1094 	RETURN
   1095 
   1096 L(exit_4):
   1097 	add	$4, %eax
   1098 	RETURN
   1099 
   1100 L(exit_5):
   1101 	add	$5, %eax
   1102 	RETURN
   1103 
   1104 L(exit_6):
   1105 	add	$6, %eax
   1106 	RETURN
   1107 
   1108 L(exit_7):
   1109 	add	$7, %eax
   1110 	RETURN
   1111 
   1112 L(exit_8):
   1113 	add	$8, %eax
   1114 	RETURN
   1115 
   1116 L(exit_9):
   1117 	add	$9, %eax
   1118 	RETURN
   1119 
   1120 L(exit_10):
   1121 	add	$10, %eax
   1122 	RETURN
   1123 
   1124 L(exit_11):
   1125 	add	$11, %eax
   1126 	RETURN
   1127 
   1128 L(exit_12):
   1129 	add	$12, %eax
   1130 	RETURN
   1131 
   1132 L(exit_13):
   1133 	add	$13, %eax
   1134 	RETURN
   1135 
   1136 L(exit_14):
   1137 	add	$14, %eax
   1138 	RETURN
   1139 
   1140 L(exit_15):
   1141 	add	$15, %eax
   1142 	RETURN
   1143 
        /*
         * L(exit_tail<k>), k = 0..14: targets of the single-byte tests in
         * L(CalculateLengthOfSrc) for a NUL at offset k from %edx.
         * Result: %eax = (%edx + k) - %ecx, returned through RETURN
         * (pop %ebx; ret).
         */
   1144 L(exit_tail0):
   1145 	mov	%edx, %eax
   1146 	sub	%ecx, %eax
   1147 	RETURN
   1148 
   1149 	.p2align 4
   1150 L(exit_tail1):
   1151 	lea	1(%edx), %eax
   1152 	sub	%ecx, %eax
   1153 	RETURN
   1154 
   1155 L(exit_tail2):
   1156 	lea	2(%edx), %eax
   1157 	sub	%ecx, %eax
   1158 	RETURN
   1159 
   1160 L(exit_tail3):
   1161 	lea	3(%edx), %eax
   1162 	sub	%ecx, %eax
   1163 	RETURN
   1164 
   1165 L(exit_tail4):
   1166 	lea	4(%edx), %eax
   1167 	sub	%ecx, %eax
   1168 	RETURN
   1169 
   1170 L(exit_tail5):
   1171 	lea	5(%edx), %eax
   1172 	sub	%ecx, %eax
   1173 	RETURN
   1174 
   1175 L(exit_tail6):
   1176 	lea	6(%edx), %eax
   1177 	sub	%ecx, %eax
   1178 	RETURN
   1179 
   1180 L(exit_tail7):
   1181 	lea	7(%edx), %eax
   1182 	sub	%ecx, %eax
   1183 	RETURN
   1184 
   1185 L(exit_tail8):
   1186 	lea	8(%edx), %eax
   1187 	sub	%ecx, %eax
   1188 	RETURN
   1189 
   1190 L(exit_tail9):
   1191 	lea	9(%edx), %eax
   1192 	sub	%ecx, %eax
   1193 	RETURN
   1194 
   1195 L(exit_tail10):
   1196 	lea	10(%edx), %eax
   1197 	sub	%ecx, %eax
   1198 	RETURN
   1199 
   1200 L(exit_tail11):
   1201 	lea	11(%edx), %eax
   1202 	sub	%ecx, %eax
   1203 	RETURN
   1204 
   1205 L(exit_tail12):
   1206 	lea	12(%edx), %eax
   1207 	sub	%ecx, %eax
   1208 	RETURN
   1209 
   1210 L(exit_tail13):
   1211 	lea	13(%edx), %eax
   1212 	sub	%ecx, %eax
   1213 	RETURN
   1214 
   1215 L(exit_tail14):
   1216 	lea	14(%edx), %eax
   1217 	sub	%ecx, %eax
   1218 	RETURN
   1219 
        /*
         * L(exit_tail15): target of the last single-byte test in
         * L(CalculateLengthOfSrc) (NUL at offset 15 from %edx).
         * Result: %eax = (%edx + 15) - %ecx, returned through RETURN
         * (pop %ebx; ret).
         */
   1220 L(exit_tail15):
   1221 	lea	15(%edx), %eax
   1222 	sub	%ecx, %eax
   1223 	RETURN
   1224 
        /*
         * Close the function.  END expands to ".size name, .-name", so the
         * name must be the symbol that ENTRY defined (strlcat_ssse3).
         * Naming any other symbol leaves strlcat_ssse3 without a size and
         * makes the size expression refer to a label this file never
         * defines.
         */
   1225 END (strlcat_ssse3)
   1226