Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2014, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
      31 #ifndef L
        /* L(label) spells a label as ".L<label>"; the ".L" prefix keeps the
           symbol local to this object file instead of exporting it.  */
      32 # define L(label)	.L##label
      33 #endif
      34 
      35 #ifndef cfi_startproc
        /* Short names for the assembler's call-frame-information directives.
           Each one is defined here only if the includer has not already
           supplied its own definition.  */
      36 # define cfi_startproc	.cfi_startproc
      37 #endif
      38 
      39 #ifndef cfi_endproc
      40 # define cfi_endproc	.cfi_endproc
      41 #endif
     42 
      43 #ifndef ENTRY
        /* ENTRY(name): open a function.  Marks `name` as a function symbol,
           exports it with .globl, aligns it to 16 bytes (.p2align 4), emits
           the label and starts its CFI region.  */
      44 # define ENTRY(name)	\
      45 	.type name, @function;	\
      46 	.globl name;	\
      47 	.p2align 4;	\
      48 name:	\
      49 	cfi_startproc
      50 #endif
      51 
      52 #ifndef END
        /* END(name): close a function opened with ENTRY.  Ends the CFI region
           and records the symbol size as the distance from `name` to here.  */
      53 # define END(name)	\
      54        cfi_endproc;	\
      55        .size name, .-name
      56 #endif
     57 
     58 
      59 #ifndef STRLCPY
        /* Name of the exported symbol.  An includer may predefine STRLCPY to
           assemble this body under another name.  */
      60 # define STRLCPY	strlcpy
      61 #endif
      62 
        /* JMPTBL(I, B): one jump-table entry, stored as the distance from the
           table base B to the target label I.  */
      63 #define JMPTBL(I, B)	I - B
        /* BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE): indirect jump through
           a table built with JMPTBL.  Loads the table address RIP-relative
           into %r11, sign-extends the 32-bit entry at TABLE + INDEX*SCALE
           into %rcx, adds the table address back and jumps there.
           Overwrites %r11 and %rcx.  */
      64 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
      65 	lea	TABLE(%rip), %r11;	\
      66 	movslq	(%r11, INDEX, SCALE), %rcx;	\
      67 	lea	(%r11, %rcx), %rcx;	\
      68 	jmp	*%rcx
      69 
        /* RETURN: the result is kept split across two registers; add the
           part accumulated in %r9 to %rax and return.  */
      70 #define RETURN	\
      71 	add	%r9, %rax;	\
      72 	ret
     73 
      74 .text
        /* STRLCPY: size-bounded string copy built on 16-byte SSE2 compares.
           When USE_AS_STRLCAT is defined the same body first skips over the
           string already in the destination and appends after it.

           In (first three SysV AMD64 integer argument registers):
             %rdi  destination pointer (target of every store)
             %rsi  source pointer (only ever loaded from)
             %rdx  size limit in bytes; copied to %r8 straight away
           Working registers:
             %r8   destination bytes still available
             %rax  running length; %r9 holds an extra part of the result
                   (RETURN adds the two)
             %rdx  bitmask of zero bytes in the current 16-byte chunk
             %rcx  misalignment of the pointer being walked
           Out:  %rax
           Also overwrites %rdi, %rsi, %r10, %r11, %xmm0-%xmm2 and the flags.
           NOTE(review): presumably this follows the BSD strlcpy/strlcat
           contract -- confirm against the C prototype used by callers.  */
      75 ENTRY (STRLCPY)
      76 	xor	%rax, %rax
      77 	xor	%r9, %r9
      78 	mov	%rdx, %r8
        	/* size == 0: nothing may be written; just measure the source.  */
      79 	cmp	$0, %r8
      80 	jz	L(CalculateSrcLen)
     81 
      82 #ifdef USE_AS_STRLCAT
        	/* Append mode only: measure the string already stored in the
        	   destination, examining no more than %r8 (size) bytes.
        	   %rax counts the bytes stepped over, %rdx receives the bitmask
        	   of zero bytes in the current chunk, and %rcx is the
        	   destination's misalignment (0 until the aligned loop starts).  */
      83 	xor	%rcx, %rcx
      84 	pxor	%xmm0, %xmm0
      85 
        	/* First 16 destination bytes, unaligned.
        	   NOTE(review): this load is issued before %r8 is compared, so it
        	   reads 16 bytes even when size is smaller -- confirm that this
        	   cannot fault at the end of a mapping.  */
      86 	movdqu	(%rdi), %xmm1
      87 	pcmpeqb %xmm1, %xmm0
      88 	pmovmskb %xmm0, %rdx
      89 
        	/* size <= 16: the limit falls inside this chunk.  */
      90 	cmp	$17, %r8
      91 	jb	L(SizeEndCase1)
      92 	test	%rdx, %rdx
      93 	jnz	L(StringEndCase1)
      94 
        	/* Second 16 bytes.  %xmm0 is still all-zero here because the
        	   first compare matched nothing.  */
      95 	add	$16, %rax
      96 	movdqu	16(%rdi), %xmm1
      97 	pcmpeqb %xmm1, %xmm0
      98 	pmovmskb %xmm0, %rdx
      99 
        	/* size <= 32: the limit falls inside the second chunk.  */
     100 	cmp	$33, %r8
     101 	jb	L(SizeEndCase1)
     102 	test	%rdx, %rdx
     103 	jnz	L(StringEndCase1)
     104 
        	/* Switch to aligned loads: %rcx = dst & 15, %rdi rounded down to
        	   16.  %r8 is rebased so that it counts from the aligned chunk
        	   at %rdi + 16, where the loop starts (%rax is 16).  */
     105 	mov	%rdi, %rcx
     106 	and	$15, %rcx
     107 	and	$-16, %rdi
     108 
     109 	add	%rcx, %r8
     110 	sub	$16, %r8
     111 
        /* One aligned 16-byte chunk per iteration.  The size budget is
           charged first; jbe fires when this chunk uses it up.  */
     112 L(DstLenLoop):
     113 	movdqa	(%rdi, %rax), %xmm1
     114 	pcmpeqb %xmm1, %xmm0
     115 	pmovmskb %xmm0, %rdx
     116 	sub	$16, %r8
     117 	jbe	L(SizeEndCase2)
     118 	test	%rdx, %rdx
     119 	jnz	L(StringEndCase2)
     120 	add	$16, %rax
     121 	jmp	L(DstLenLoop)
     122 
        /* Terminator found in the loop with budget to spare.  Undo the
           16-byte charge, then charge only the bytes before the terminator.
           %rax becomes the terminator's offset from the aligned %rdi, %r9
           becomes -%rcx (L(CopySrcString) adds %rax to it), and %rdi is
           moved onto the terminator.  */
     123 L(StringEndCase2):
     124 	add	$16, %r8
     125 	bsf	%rdx, %rdx
     126 	sub	%rdx, %r8
     127 	add	%rdx, %rax
     128 	sub	%rcx, %r9
     129 	add	%rax, %rdi
     130 	jmp	 L(CopySrcString)
     131 
        /* The size limit falls inside one of the two unaligned chunks.  If a
           terminator sits below the limit, continue at L(StringEnd);
           otherwise fall into L(SizeEnd).  */
     132 L(SizeEndCase1):
     133 	test	%rdx, %rdx
     134 	jz	L(SizeEnd)
     135 	bsf	%rdx, %rdx
     136 	add	%rdx, %rax
     137 	cmp	%r8, %rax
     138 	jb	L(StringEnd)
        /* No terminator within the limit: nothing is copied.  %r9 takes the
           size and the source is only measured.  */
     139 L(SizeEnd):
     140 	mov	%r8, %r9
     141 	jmp	L(CalculateSrcLenCase1)
     142 
        /* The size budget ran out inside an aligned-loop chunk.  %r8 is
           restored to the bytes allowed in this chunk; a terminator below
           that bound goes to L(StringEndCase3).  */
     143 L(SizeEndCase2):
     144 	add	$16, %r8
     145 	test	%rdx, %rdx
     146 	jz	L(StringEndCase4)
     147 	bsf	%rdx, %rdx
     148 	cmp	%r8, %rdx
     149 	jb	L(StringEndCase3)
        /* Despite the label name, this is the no-terminator-within-limit
           exit of the loop: %r9 = %rax + %r8 - %rcx, then only the source
           is measured.  */
     150 L(StringEndCase4):
     151 	add	%r8, %rax
     152 	sub	%rcx, %rax
     153 	mov	%rax, %r9
     154 	jmp	L(CalculateSrcLenCase1)
     155 
        /* Terminator inside the last permitted chunk: same bookkeeping as
           L(StringEndCase2), except that %r8 already holds the un-charged
           value.  */
     156 L(StringEndCase3):
     157 	add	%rdx, %rax
     158 	sub	%rcx, %r9
     159 	add	%rax, %rdi
     160 	sub	%rdx, %r8
     161 	jmp	L(CopySrcString)
     162 
        /* Terminator inside one of the two unaligned chunks, limit beyond
           the chunk.  %rcx is still 0 on this path.  */
     163 L(StringEndCase1):
     164 	bsf	%rdx, %rdx
     165 	add	%rdx, %rax
     166 	sub	%rcx, %rax
        /* %rax = length of the existing destination string.  Point %rdi at
           its terminator and shrink the budget by the same amount, then
           fall through into the copy code.  */
     167 L(StringEnd):
     168 	add	%rax, %rdi
     169 	sub	%rax, %r8
     170 #endif
     171 
        	/* Choose how to start reading the source.  %rcx = src & 63; at
        	   offset 32 or below the plain unaligned path in
        	   L(CopySrcString) is taken.
        	   NOTE(review): presumably the test keeps that path's two
        	   unaligned 16-byte loads inside one 64-byte line -- confirm.  */
     172 	mov	%rsi, %rcx
     173 	and	$63, %rcx
     174 	cmp	$32, %rcx
     175 	jbe	L(CopySrcString)
     176 
        	/* Otherwise read with aligned compares: %rsi rounded down to 16,
        	   %rcx = src & 15.  */
     177 	and	$-16, %rsi
     178 	and	$15, %rcx
     179 	pxor	%xmm0, %xmm0
     180 	pxor	%xmm1, %xmm1
     181 
        	/* Zero-byte mask of the aligned chunk holding src[0]; the shift
        	   discards the %rcx bytes that lie before the string.
        	   %r10 = 16 - %rcx = source bytes covered so far.  */
     182 	pcmpeqb	(%rsi), %xmm1
     183 	pmovmskb %xmm1, %rdx
     184 	shr	%cl, %rdx
     185 	mov	$16, %r10
     186 	sub	%rcx, %r10
        	/* Budget ends within the bytes examined?  */
     187 	cmp	%r10, %r8
     188 	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
     189 	test	%rdx, %rdx
     190 	jnz	L(CopyFrom1To16BytesTail)
     191 
        	/* Second aligned chunk; %r10 grows to 32 - %rcx.  */
     192 	pcmpeqb	16(%rsi), %xmm0
     193 	pmovmskb %xmm0, %rdx
     194 	add	$16, %r10
     195 	cmp	%r10, %r8
     196 	jbe	L(CopyFrom1To32BytesCase2OrCase3)
     197 	test	%rdx, %rdx
     198 	jnz	L(CopyFrom1To32Bytes)
     199 
        	/* No terminator and enough budget: copy the first 16 source bytes
        	   (%rsi + %rcx is the original source pointer) and join the main
        	   loop.  */
     200 	movdqu	(%rsi, %rcx), %xmm1
     201 	movdqu	%xmm1, (%rdi)
     202 #ifdef USE_AS_STRLCAT
        	/* Bank the destination length before L(LoopStart) reuses %rax.  */
     203 	add	%rax, %r9
     204 #endif
     205 	jmp	L(LoopStart)
    206 
     207 	.p2align 4
        /* Unaligned start of the copy: handle the first two 16-byte chunks
           with unaligned loads, then switch to the aligned loop.  */
     208 L(CopySrcString):
     209 #ifdef USE_AS_STRLCAT
        	/* Fold the destination length into %r9 and restart %rax at 0 so
        	   it counts source bytes from here on.  */
     210 	add	%rax, %r9
     211 	xor	%rax, %rax
     212 #endif
     213 	pxor	%xmm0, %xmm0
     214 	movdqu	(%rsi), %xmm1
     215 	pcmpeqb	%xmm1, %xmm0
     216 	pmovmskb %xmm0, %rdx
     217 
        	/* Budget of 16 or less: it ends inside this first chunk.  */
     218 	cmp	$17, %r8
     219 	jb	L(CopyFrom1To16BytesTail1Case2OrCase3)
     220 	test	%rdx, %rdx
     221 	jnz	L(CopyFrom1To16BytesTail1)
     222 
        	/* Check the second chunk and store the first one, which is now
        	   known to contain no terminator.  */
     223 	movdqu	16(%rsi), %xmm2
     224 	pcmpeqb	%xmm2, %xmm0
     225 	movdqu	%xmm1, (%rdi)
     226 	pmovmskb %xmm0, %rdx
     227 	add	$16, %rax
     228 
        	/* Budget of 32 or less: it ends inside the second chunk.  */
     229 	cmp	$33, %r8
     230 	jb	L(CopyFrom1To32Bytes1Case2OrCase3)
     231 	test	%rdx, %rdx
     232 	jnz	L(CopyFrom1To32Bytes1)
     233 
        	/* %rcx = src & 15; round %rsi down to a 16-byte boundary.  */
     234 	mov	%rsi, %rcx
     235 	and	$15, %rcx
     236 	and	$-16, %rsi
     237 
        /* Shift %rdi back by the same %rcx so that (%rsi,%rax) and (%rdi,%rax)
           address the same position in the string.  %r8 is rebased to count
           from the aligned chunk at %rsi + 16, where the loop begins.  */
     238 L(LoopStart):
     239 	sub	%rcx, %rdi
     240 	add	%rcx, %r8
     241 	sub	$16, %r8
     242 	mov	$16, %rax
     243 
        /* Main loop: aligned 16-byte load, unaligned 16-byte store.  The
           budget is charged before the terminator test, and the chunk is
           stored only when neither the budget nor the string ends in it.  */
     244 L(16Loop):
     245 	movdqa	(%rsi, %rax), %xmm1
     246 	pcmpeqb	%xmm1, %xmm0
     247 	pmovmskb %xmm0, %rdx
     248 	sub	$16, %r8
     249 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
     250 	test	%rdx, %rdx
     251 	jnz	L(CopyFrom1To16BytesXmmExit)
     252 	movdqu	%xmm1, (%rdi, %rax)
     253 	add	$16, %rax
     254 	jmp	L(16Loop)
    255 
    256 /*------End of main part with loops---------------------*/
    257 
     258 /* Case1: a source terminator was found.  Each handler turns the mask in
        %rdx into the terminator's index, adds it to the length in %rax and
        dispatches to L(StringTail<index>), which copies through the
        terminator and returns.  */
     259 	.p2align 4
        /* No branch to this label is visible in this file.  */
     260 L(CopyFrom1To16Bytes):
     261 	add	%rcx, %rdi
     262 	add	%rcx, %rsi
     263 	bsf	%rdx, %rdx
     264 	add	%rdx, %rax
     265 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
     266 
     267 	.p2align 4
        /* Aligned start-up path, first chunk: %rsi + %rcx restores the
           original source pointer.  */
     268 L(CopyFrom1To16BytesTail):
     269 	add	%rcx, %rsi
     270 	bsf	%rdx, %rdx
     271 	add	%rdx, %rax
     272 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
     273 
     274 	.p2align 4
        /* Unaligned path, second chunk: the first 16 bytes are already
           stored, so step both pointers (and the budget) past them.  */
     275 L(CopyFrom1To32Bytes1):
     276 	add	$16, %rsi
     277 	add	$16, %rdi
     278 	sub	$16, %r8
        /* Unaligned path, first chunk (also the tail of the case above).  */
     279 L(CopyFrom1To16BytesTail1):
     280 	bsf	%rdx, %rdx
     281 	add	%rdx, %rax
     282 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
     283 
     284 	.p2align 4
        /* Aligned start-up path, second chunk: the terminator's index from
           the original source pointer is bit index + 16 - %rcx.  */
     285 L(CopyFrom1To32Bytes):
     286 	bsf	%rdx, %rdx
     287 	add	%rcx, %rsi
     288 	add	$16, %rdx
     289 	sub	%rcx, %rdx
     290 	add	%rdx, %rax
     291 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
     292 
     293 	.p2align 4
        /* Shared tail for the Case2 handlers: %rdx already holds the
           terminator index and it is below the remaining budget.  */
     294 L(CopyFrom1To16BytesExit):
     295 	add	%rdx, %rax
     296 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
    297 
     298 /* Case2: the budget ends in a chunk that also contains a terminator.
        Whichever comes first wins: terminator index < %r8 goes to
        L(CopyFrom1To16BytesExit) (complete copy); otherwise the copy is
        truncated through L(Exit<%r8>).  */
     299 
     300 	.p2align 4
        /* From the main loop: undo the 16-byte charge on %r8, move both
           pointers to the current chunk, and convert %rax from a chunk
           offset to a length by removing the misalignment %rcx.  */
     301 L(CopyFrom1To16BytesCase2):
     302 	add	$16, %r8
     303 	add	%rax, %rdi
     304 	add	%rax, %rsi
     305 	bsf	%rdx, %rdx
     306 	sub	%rcx, %rax
     307 	cmp	%r8, %rdx
     308 	jb	L(CopyFrom1To16BytesExit)
     309 	add	%r8, %rax
     310 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     311 
     312 	.p2align 4
        /* Aligned start-up path, second chunk: terminator index relative to
           the original source pointer is bit index + 16 - %rcx.  */
     313 L(CopyFrom1To32BytesCase2):
     314 	add	%rcx, %rsi
     315 	bsf	%rdx, %rdx
     316 	add	$16, %rdx
     317 	sub	%rcx, %rdx
     318 	cmp	%r8, %rdx
     319 	jb	L(CopyFrom1To16BytesExit)
     320 	add	%r8, %rax
     321 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     322 
        /* Aligned start-up path, first chunk.  */
     323 L(CopyFrom1To16BytesTailCase2):
     324 	add	%rcx, %rsi
     325 	bsf	%rdx, %rdx
     326 	cmp	%r8, %rdx
     327 	jb	L(CopyFrom1To16BytesExit)
     328 	add	%r8, %rax
     329 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     330 
     331 	.p2align 4
        /* Unaligned path; pointers already address the chunk.  */
     332 L(CopyFrom1To16BytesTail1Case2):
     333 	bsf	%rdx, %rdx
     334 	cmp	%r8, %rdx
     335 	jb	L(CopyFrom1To16BytesExit)
     336 	add	%r8, %rax
     337 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
    338 
     339 /* Case2 or Case3: the budget ends in the current chunk.  A non-zero mask
        hands over to the matching Case2 handler; otherwise (Case3, no
        terminator in the chunk) the copy is truncated via L(Exit<%r8>).  */
     340 
     341 	.p2align 4
        /* From the main loop.  Case3 performs the same pointer and length
           fix-ups as L(CopyFrom1To16BytesCase2).  */
     342 L(CopyFrom1To16BytesCase2OrCase3):
     343 	test	%rdx, %rdx
     344 	jnz	L(CopyFrom1To16BytesCase2)
     345 	add	$16, %r8
     346 	add	%rax, %rdi
     347 	add	%rax, %rsi
     348 	add	%r8, %rax
     349 	sub	%rcx, %rax
     350 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     351 
     352 	.p2align 4
        /* Aligned start-up path, second chunk.  */
     353 L(CopyFrom1To32BytesCase2OrCase3):
     354 	test	%rdx, %rdx
     355 	jnz	L(CopyFrom1To32BytesCase2)
     356 	add	%rcx, %rsi
     357 	add	%r8, %rax
     358 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     359 
     360 	.p2align 4
        /* Aligned start-up path, first chunk.  */
     361 L(CopyFrom1To16BytesTailCase2OrCase3):
     362 	test	%rdx, %rdx
     363 	jnz	L(CopyFrom1To16BytesTailCase2)
     364 	add	%rcx, %rsi
     365 	add	%r8, %rax
     366 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     367 
     368 	.p2align 4
        /* Unaligned path, second chunk: skip the 16 bytes already stored.  */
     369 L(CopyFrom1To32Bytes1Case2OrCase3):
     370 	add	$16, %rdi
     371 	add	$16, %rsi
     372 	sub	$16, %r8
        /* Unaligned path, first chunk (also the tail of the case above).  */
     373 L(CopyFrom1To16BytesTail1Case2OrCase3):
     374 	test	%rdx, %rdx
     375 	jnz	L(CopyFrom1To16BytesTail1Case2)
     376 	add	%r8, %rax
     377 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
     378 
     379 	.p2align 4
        /* Main-loop exit when a terminator is found with budget left: move
           both pointers to the current chunk, set %rax to the final length
           (chunk offset + terminator index - misalignment) and copy the
           tail through the terminator.  */
     380 L(CopyFrom1To16BytesXmmExit):
     381 	bsf	%rdx, %rdx
     382 	add	%rax, %rdi
     383 	add	%rax, %rsi
     384 	add	%rdx, %rax
     385 	sub	%rcx, %rax
     386 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
    387 
     388 /*------------End of labels for copying 1-16 bytes and 1-32 bytes----*/
    389 
    390 
     391 	.p2align 4
        /* L(Exit0) .. L(Exit32): truncating exits, selected through
           L(ExitTable) with the remaining budget %r8 as the index.
           L(ExitN) copies the first N-1 bytes at (%rsi) to (%rdi) with a
           fixed sequence of moves (two moves may overlap when N-1 is not a
           natural operand size), stores a zero byte at offset N-1, and jumps
           to L(CalculateSrcLen) so the rest of the source is still counted.
           L(Exit1) stores only the terminator; L(Exit0) stores nothing and
           returns at once.  Scratch registers: %rcx, %rdx, %xmm0, %xmm2.  */
     392 L(Exit0):
     393 	RETURN
     394 
     395 	.p2align 4
     396 L(Exit1):
     397 	movb	$0, (%rdi)
     398 	jmp	L(CalculateSrcLen)
     399 
     400 	.p2align 4
     401 L(Exit2):
     402 	movb	(%rsi), %dh
     403 	movb	%dh, (%rdi)
     404 	movb	$0, 1(%rdi)
     405 	jmp	L(CalculateSrcLen)
     406 
     407 	.p2align 4
     408 L(Exit3):
     409 	movw	(%rsi), %dx
     410 	movw	%dx, (%rdi)
     411 	movb	$0, 2(%rdi)
     412 	jmp	L(CalculateSrcLen)
     413 
     414 	.p2align 4
     415 L(Exit4):
     416 	movw	(%rsi), %cx
     417 	movb	2(%rsi), %dh
     418 	movw	%cx, (%rdi)
     419 	movb	%dh, 2(%rdi)
     420 	movb	$0, 3(%rdi)
     421 	jmp	L(CalculateSrcLen)
     422 
     423 	.p2align 4
     424 L(Exit5):
     425 	movl	(%rsi), %edx
     426 	movl	%edx, (%rdi)
     427 	movb	$0, 4(%rdi)
     428 	jmp	L(CalculateSrcLen)
     429 
     430 	.p2align 4
     431 L(Exit6):
     432 	movl	(%rsi), %ecx
     433 	movb	4(%rsi), %dh
     434 	movl	%ecx, (%rdi)
     435 	movb	%dh, 4(%rdi)
     436 	movb	$0, 5(%rdi)
     437 	jmp	L(CalculateSrcLen)
     438 
     439 	.p2align 4
     440 L(Exit7):
     441 	movl	(%rsi), %ecx
     442 	movw	4(%rsi), %dx
     443 	movl	%ecx, (%rdi)
     444 	movw	%dx, 4(%rdi)
     445 	movb	$0, 6(%rdi)
     446 	jmp	L(CalculateSrcLen)
     447 
     448 	.p2align 4
        /* 7 bytes as two overlapping 4-byte moves (offsets 0 and 3).  */
     449 L(Exit8):
     450 	movl	(%rsi), %ecx
     451 	movl	3(%rsi), %edx
     452 	movl	%ecx, (%rdi)
     453 	movl	%edx, 3(%rdi)
     454 	movb	$0, 7(%rdi)
     455 	jmp	L(CalculateSrcLen)
     456 
     457 	.p2align 4
     458 L(Exit9):
     459 	movq	(%rsi), %rdx
     460 	movq	%rdx, (%rdi)
     461 	movb	$0, 8(%rdi)
     462 	jmp	L(CalculateSrcLen)
     463 
     464 	.p2align 4
     465 L(Exit10):
     466 	movq	(%rsi), %rcx
     467 	movb	8(%rsi), %dh
     468 	movq	%rcx, (%rdi)
     469 	movb	%dh, 8(%rdi)
     470 	movb	$0, 9(%rdi)
     471 	jmp	L(CalculateSrcLen)
     472 
     473 	.p2align 4
     474 L(Exit11):
     475 	movq	(%rsi), %rcx
     476 	movw	8(%rsi), %dx
     477 	movq	%rcx, (%rdi)
     478 	movw	%dx, 8(%rdi)
     479 	movb	$0, 10(%rdi)
     480 	jmp	L(CalculateSrcLen)
     481 
     482 	.p2align 4
     483 L(Exit12):
     484 	movq	(%rsi), %rcx
     485 	movl	7(%rsi), %edx
     486 	movq	%rcx, (%rdi)
     487 	movl	%edx, 7(%rdi)
     488 	movb	$0, 11(%rdi)
     489 	jmp	L(CalculateSrcLen)
     490 
     491 	.p2align 4
     492 L(Exit13):
     493 	movq	(%rsi), %rcx
     494 	movl	8(%rsi), %edx
     495 	movq	%rcx, (%rdi)
     496 	movl	%edx, 8(%rdi)
     497 	movb	$0, 12(%rdi)
     498 	jmp	L(CalculateSrcLen)
     499 
     500 	.p2align 4
     501 L(Exit14):
     502 	movq	(%rsi), %rcx
     503 	movq	5(%rsi), %rdx
     504 	movq	%rcx, (%rdi)
     505 	movq	%rdx, 5(%rdi)
     506 	movb	$0, 13(%rdi)
     507 	jmp	L(CalculateSrcLen)
     508 
     509 	.p2align 4
     510 L(Exit15):
     511 	movq	(%rsi), %rcx
     512 	movq	6(%rsi), %rdx
     513 	movq	%rcx, (%rdi)
     514 	movq	%rdx, 6(%rdi)
     515 	movb	$0, 14(%rdi)
     516 	jmp	L(CalculateSrcLen)
     517 
     518 	.p2align 4
     519 L(Exit16):
     520 	movq	(%rsi), %rcx
     521 	movq	7(%rsi), %rdx
     522 	movq	%rcx, (%rdi)
     523 	movq	%rdx, 7(%rdi)
     524 	movb	$0, 15(%rdi)
     525 	jmp	L(CalculateSrcLen)
     526 
     527 	.p2align 4
        /* From here on the first 16 bytes travel through %xmm0.  */
     528 L(Exit17):
     529 	movdqu	(%rsi), %xmm0
     530 	movdqu	%xmm0, (%rdi)
     531 	movb	$0, 16(%rdi)
     532 	jmp	L(CalculateSrcLen)
     533 
     534 	.p2align 4
     535 L(Exit18):
     536 	movdqu	(%rsi), %xmm0
     537 	movb	16(%rsi), %dh
     538 	movdqu	%xmm0, (%rdi)
     539 	movb	%dh, 16(%rdi)
     540 	movb	$0, 17(%rdi)
     541 	jmp	L(CalculateSrcLen)
     542 
     543 	.p2align 4
     544 L(Exit19):
     545 	movdqu	(%rsi), %xmm0
     546 	movw	16(%rsi), %cx
     547 	movdqu	%xmm0, (%rdi)
     548 	movw	%cx, 16(%rdi)
     549 	movb	$0, 18(%rdi)
     550 	jmp	L(CalculateSrcLen)
     551 
     552 	.p2align 4
     553 L(Exit20):
     554 	movdqu	(%rsi), %xmm0
     555 	movl	15(%rsi), %ecx
     556 	movdqu	%xmm0, (%rdi)
     557 	movl	%ecx, 15(%rdi)
     558 	movb	$0, 19(%rdi)
     559 	jmp	L(CalculateSrcLen)
     560 
     561 	.p2align 4
     562 L(Exit21):
     563 	movdqu	(%rsi), %xmm0
     564 	movl	16(%rsi), %ecx
     565 	movdqu	%xmm0, (%rdi)
     566 	movl	%ecx, 16(%rdi)
     567 	movb	$0, 20(%rdi)
     568 	jmp	L(CalculateSrcLen)
     569 
     570 	.p2align 4
     571 L(Exit22):
     572 	movdqu	(%rsi), %xmm0
     573 	movl	16(%rsi), %ecx
     574 	movb	20(%rsi), %dh
     575 	movdqu	%xmm0, (%rdi)
     576 	movl	%ecx, 16(%rdi)
     577 	movb	%dh, 20(%rdi)
     578 	movb	$0, 21(%rdi)
     579 	jmp	L(CalculateSrcLen)
     580 
     581 	.p2align 4
     582 L(Exit23):
     583 	movdqu	(%rsi), %xmm0
     584 	movq	14(%rsi), %rcx
     585 	movdqu	%xmm0, (%rdi)
     586 	movq	%rcx, 14(%rdi)
     587 	movb	$0, 22(%rdi)
     588 	jmp	L(CalculateSrcLen)
     589 
     590 	.p2align 4
     591 L(Exit24):
     592 	movdqu	(%rsi), %xmm0
     593 	movq	15(%rsi), %rcx
     594 	movdqu	%xmm0, (%rdi)
     595 	movq	%rcx, 15(%rdi)
     596 	movb	$0, 23(%rdi)
     597 	jmp	L(CalculateSrcLen)
     598 
     599 	.p2align 4
     600 L(Exit25):
     601 	movdqu	(%rsi), %xmm0
     602 	movq	16(%rsi), %rcx
     603 	movdqu	%xmm0, (%rdi)
     604 	movq	%rcx, 16(%rdi)
     605 	movb	$0, 24(%rdi)
     606 	jmp	L(CalculateSrcLen)
     607 
     608 	.p2align 4
     609 L(Exit26):
     610 	movdqu	(%rsi), %xmm0
     611 	movq	16(%rsi), %rcx
     612 	movb	24(%rsi), %dh
     613 	movdqu	%xmm0, (%rdi)
     614 	movq	%rcx, 16(%rdi)
     615 	mov	%dh, 24(%rdi)
     616 	movb	$0, 25(%rdi)
     617 	jmp	L(CalculateSrcLen)
     618 
     619 	.p2align 4
     620 L(Exit27):
     621 	movdqu	(%rsi), %xmm0
     622 	movq	16(%rsi), %rdx
     623 	movw	24(%rsi), %cx
     624 	movdqu	%xmm0, (%rdi)
     625 	movq	%rdx, 16(%rdi)
     626 	movw	%cx, 24(%rdi)
     627 	movb	$0, 26(%rdi)
     628 	jmp	L(CalculateSrcLen)
     629 
     630 	.p2align 4
     631 L(Exit28):
     632 	movdqu	(%rsi), %xmm0
     633 	movq	16(%rsi), %rdx
     634 	movl	23(%rsi), %ecx
     635 	movdqu	%xmm0, (%rdi)
     636 	movq	%rdx, 16(%rdi)
     637 	movl	%ecx, 23(%rdi)
     638 	movb	$0, 27(%rdi)
     639 	jmp	L(CalculateSrcLen)
     640 
     641 	.p2align 4
     642 L(Exit29):
     643 	movdqu	(%rsi), %xmm0
     644 	movq	16(%rsi), %rdx
     645 	movl	24(%rsi), %ecx
     646 	movdqu	%xmm0, (%rdi)
     647 	movq	%rdx, 16(%rdi)
     648 	movl	%ecx, 24(%rdi)
     649 	movb	$0, 28(%rdi)
     650 	jmp	L(CalculateSrcLen)
     651 
     652 	.p2align 4
        /* 29..31 bytes: two overlapping 16-byte moves.  */
     653 L(Exit30):
     654 	movdqu	(%rsi), %xmm0
     655 	movdqu	13(%rsi), %xmm2
     656 	movdqu	%xmm0, (%rdi)
     657 	movdqu	%xmm2, 13(%rdi)
     658 	movb	$0, 29(%rdi)
     659 	jmp	L(CalculateSrcLen)
     660 
     661 	.p2align 4
     662 L(Exit31):
     663 	movdqu	(%rsi), %xmm0
     664 	movdqu	14(%rsi), %xmm2
     665 	movdqu	%xmm0, (%rdi)
     666 	movdqu	%xmm2, 14(%rdi)
     667 	movb	$0, 30(%rdi)
     668 	jmp	L(CalculateSrcLen)
     669 
     670 	.p2align 4
     671 L(Exit32):
     672 	movdqu	(%rsi), %xmm0
     673 	movdqu	15(%rsi), %xmm2
     674 	movdqu	%xmm0, (%rdi)
     675 	movdqu	%xmm2, 15(%rdi)
     676 	movb	$0, 31(%rdi)
     677 	jmp	L(CalculateSrcLen)
    678 
     679 	.p2align 4
        /* L(StringTail0) .. L(StringTail33): complete-copy exits, selected
           through L(ExitStringTailTable) with the index of the source's
           terminating zero byte (relative to %rsi) as the table index.
           L(StringTailN) copies N+1 bytes from (%rsi) to (%rdi) -- the N
           characters plus the terminator already present in the source --
           and finishes with RETURN.  Two moves may overlap when N+1 is not
           a natural operand size.  Scratch: %rcx, %rdx, %xmm0, %xmm2.  */
     680 L(StringTail0):
     681 	mov	(%rsi), %dl
     682 	mov	%dl, (%rdi)
     683 	RETURN
     684 
     685 	.p2align 4
     686 L(StringTail1):
     687 	mov	(%rsi), %dx
     688 	mov	%dx, (%rdi)
     689 	RETURN
     690 
     691 	.p2align 4
     692 L(StringTail2):
     693 	mov	(%rsi), %cx
     694 	mov	2(%rsi), %dl
     695 	mov	%cx, (%rdi)
     696 	mov	%dl, 2(%rdi)
     697 	RETURN
     698 
     699 	.p2align 4
     700 L(StringTail3):
     701 	mov	(%rsi), %edx
     702 	mov	%edx, (%rdi)
     703 	RETURN
     704 
     705 	.p2align 4
     706 L(StringTail4):
     707 	mov	(%rsi), %ecx
     708 	mov	4(%rsi), %dl
     709 	mov	%ecx, (%rdi)
     710 	mov	%dl, 4(%rdi)
     711 	RETURN
     712 
     713 	.p2align 4
     714 L(StringTail5):
     715 	mov	(%rsi), %ecx
     716 	mov	4(%rsi), %dx
     717 	mov	%ecx, (%rdi)
     718 	mov	%dx, 4(%rdi)
     719 	RETURN
     720 
     721 	.p2align 4
     722 L(StringTail6):
     723 	mov	(%rsi), %ecx
     724 	mov	3(%rsi), %edx
     725 	mov	%ecx, (%rdi)
     726 	mov	%edx, 3(%rdi)
     727 	RETURN
     728 
     729 	.p2align 4
     730 L(StringTail7):
     731 	mov	(%rsi), %rdx
     732 	mov	%rdx, (%rdi)
     733 	RETURN
     734 
     735 	.p2align 4
     736 L(StringTail8):
     737 	mov	(%rsi), %rcx
     738 	mov	8(%rsi), %dl
     739 	mov	%rcx, (%rdi)
     740 	mov	%dl, 8(%rdi)
     741 	RETURN
     742 
     743 	.p2align 4
     744 L(StringTail9):
     745 	mov	(%rsi), %rcx
     746 	mov	8(%rsi), %dx
     747 	mov	%rcx, (%rdi)
     748 	mov	%dx, 8(%rdi)
     749 	RETURN
     750 
     751 	.p2align 4
     752 L(StringTail10):
     753 	mov	(%rsi), %rcx
     754 	mov	7(%rsi), %edx
     755 	mov	%rcx, (%rdi)
     756 	mov	%edx, 7(%rdi)
     757 	RETURN
     758 
     759 	.p2align 4
     760 L(StringTail11):
     761 	mov	(%rsi), %rcx
     762 	mov	8(%rsi), %edx
     763 	mov	%rcx, (%rdi)
     764 	mov	%edx, 8(%rdi)
     765 	RETURN
     766 
     767 	.p2align 4
     768 L(StringTail12):
     769 	mov	(%rsi), %rcx
     770 	mov	5(%rsi), %rdx
     771 	mov	%rcx, (%rdi)
     772 	mov	%rdx, 5(%rdi)
     773 	RETURN
     774 
     775 	.p2align 4
     776 L(StringTail13):
     777 	mov	(%rsi), %rcx
     778 	mov	6(%rsi), %rdx
     779 	mov	%rcx, (%rdi)
     780 	mov	%rdx, 6(%rdi)
     781 	RETURN
     782 
     783 	.p2align 4
     784 L(StringTail14):
     785 	mov	(%rsi), %rcx
     786 	mov	7(%rsi), %rdx
     787 	mov	%rcx, (%rdi)
     788 	mov	%rdx, 7(%rdi)
     789 	RETURN
     790 
     791 	.p2align 4
     792 L(StringTail15):
     793 	movdqu	(%rsi), %xmm0
     794 	movdqu	%xmm0, (%rdi)
     795 	RETURN
     796 
     797 	.p2align 4
     798 L(StringTail16):
     799 	movdqu	(%rsi), %xmm0
     800 	mov	16(%rsi), %cl
     801 	movdqu	%xmm0, (%rdi)
     802 	mov	%cl, 16(%rdi)
     803 	RETURN
     804 
     805 	.p2align 4
     806 L(StringTail17):
     807 	movdqu	(%rsi), %xmm0
     808 	mov	16(%rsi), %cx
     809 	movdqu	%xmm0, (%rdi)
     810 	mov	%cx, 16(%rdi)
     811 	RETURN
     812 
     813 	.p2align 4
     814 L(StringTail18):
     815 	movdqu	(%rsi), %xmm0
     816 	mov	15(%rsi), %ecx
     817 	movdqu	%xmm0, (%rdi)
     818 	mov	%ecx, 15(%rdi)
     819 	RETURN
     820 
     821 	.p2align 4
     822 L(StringTail19):
     823 	movdqu	(%rsi), %xmm0
     824 	mov	16(%rsi), %ecx
     825 	movdqu	%xmm0, (%rdi)
     826 	mov	%ecx, 16(%rdi)
     827 	RETURN
     828 
     829 	.p2align 4
     830 L(StringTail20):
     831 	movdqu	(%rsi), %xmm0
     832 	mov	16(%rsi), %ecx
     833 	mov	20(%rsi), %dl
     834 	movdqu	%xmm0, (%rdi)
     835 	mov	%ecx, 16(%rdi)
     836 	mov	%dl, 20(%rdi)
     837 	RETURN
     838 
     839 	.p2align 4
     840 L(StringTail21):
     841 	movdqu	(%rsi), %xmm0
     842 	mov	14(%rsi), %rcx
     843 	movdqu	%xmm0, (%rdi)
     844 	mov	%rcx, 14(%rdi)
     845 	RETURN
     846 
     847 	.p2align 4
     848 L(StringTail22):
     849 	movdqu	(%rsi), %xmm0
     850 	mov	15(%rsi), %rcx
     851 	movdqu	%xmm0, (%rdi)
     852 	mov	%rcx, 15(%rdi)
     853 	RETURN
     854 
     855 	.p2align 4
     856 L(StringTail23):
     857 	movdqu	(%rsi), %xmm0
     858 	mov	16(%rsi), %rcx
     859 	movdqu	%xmm0, (%rdi)
     860 	mov	%rcx, 16(%rdi)
     861 	RETURN
     862 
     863 	.p2align 4
     864 L(StringTail24):
     865 	movdqu	(%rsi), %xmm0
     866 	mov	16(%rsi), %rdx
     867 	mov	24(%rsi), %cl
     868 	movdqu	%xmm0, (%rdi)
     869 	mov	%rdx, 16(%rdi)
     870 	mov	%cl, 24(%rdi)
     871 	RETURN
     872 
     873 	.p2align 4
     874 L(StringTail25):
     875 	movdqu	(%rsi), %xmm0
     876 	mov	16(%rsi), %rdx
     877 	mov	24(%rsi), %cx
     878 	movdqu	%xmm0, (%rdi)
     879 	mov	%rdx, 16(%rdi)
     880 	mov	%cx, 24(%rdi)
     881 	RETURN
     882 
     883 	.p2align 4
     884 L(StringTail26):
     885 	movdqu	(%rsi), %xmm0
     886 	mov	16(%rsi), %rdx
     887 	mov	23(%rsi), %ecx
     888 	movdqu	%xmm0, (%rdi)
     889 	mov	%rdx, 16(%rdi)
     890 	mov	%ecx, 23(%rdi)
     891 	RETURN
     892 
     893 	.p2align 4
     894 L(StringTail27):
     895 	movdqu	(%rsi), %xmm0
     896 	mov	16(%rsi), %rdx
     897 	mov	24(%rsi), %ecx
     898 	movdqu	%xmm0, (%rdi)
     899 	mov	%rdx, 16(%rdi)
     900 	mov	%ecx, 24(%rdi)
     901 	RETURN
     902 
     903 	.p2align 4
     904 L(StringTail28):
     905 	movdqu	(%rsi), %xmm0
     906 	movdqu	13(%rsi), %xmm2
     907 	movdqu	%xmm0, (%rdi)
     908 	movdqu	%xmm2, 13(%rdi)
     909 	RETURN
     910 
     911 	.p2align 4
     912 L(StringTail29):
     913 	movdqu	(%rsi), %xmm0
     914 	movdqu	14(%rsi), %xmm2
     915 	movdqu	%xmm0, (%rdi)
     916 	movdqu	%xmm2, 14(%rdi)
     917 	RETURN
     918 
     919 	.p2align 4
     920 L(StringTail30):
     921 	movdqu	(%rsi), %xmm0
     922 	movdqu	15(%rsi), %xmm2
     923 	movdqu	%xmm0, (%rdi)
     924 	movdqu	%xmm2, 15(%rdi)
     925 	RETURN
     926 
     927 	.p2align 4
     928 L(StringTail31):
     929 	movdqu	(%rsi), %xmm0
     930 	movdqu	16(%rsi), %xmm2
     931 	movdqu	%xmm0, (%rdi)
     932 	movdqu	%xmm2, 16(%rdi)
     933 	RETURN
     934 
     935 	.p2align 4
     936 L(StringTail32):
     937 	movdqu	(%rsi), %xmm0
     938 	movdqu	16(%rsi), %xmm2
     939 	mov	32(%rsi), %cl
     940 	movdqu	%xmm0, (%rdi)
     941 	movdqu	%xmm2, 16(%rdi)
     942 	mov	%cl, 32(%rdi)
     943 	RETURN
     944 
     945 	.p2align 4
        /* NOTE(review): this body is identical to L(StringTail32) and copies
           33 bytes, one fewer than the N+1 pattern would give for N = 33.
           The dispatch sites visible in this file appear to produce indices
           no larger than 31, so this entry looks unreachable -- confirm
           before relying on it.  */
     946 L(StringTail33):
     947 	movdqu	(%rsi), %xmm0
     948 	movdqu	16(%rsi), %xmm2
     949 	mov	32(%rsi), %cl
     950 	movdqu	%xmm0, (%rdi)
     951 	movdqu	%xmm2, 16(%rdi)
     952 	mov	%cl, 32(%rdi)
     953 	RETURN
    954 
     955 	.p2align 4
        /* Entry used when nothing was copied and %r9 already holds its part
           of the result: clear the skip count and the running length.  */
     956 L(CalculateSrcLenCase1):
     957 	xor	%r8, %r8
     958 	xor	%rax, %rax
        /* Finish the return value after the destination has been dealt with:
           skip the %r8 source bytes already accounted for in %rax, then
           count up to the source's terminator.  Nothing is stored here.  */
     959 L(CalculateSrcLen):
     960 	pxor	%xmm0, %xmm0
     961 	xor	%rcx, %rcx
     962 	add	%r8, %rsi
        	/* NOTE(review): unaligned 16-byte load with no alignment check;
        	   it can read bytes beyond the terminator -- confirm that this
        	   is safe at the end of a mapping.  */
     963 	movdqu	(%rsi), %xmm1
     964 	pcmpeqb	%xmm1, %xmm0
     965 	pmovmskb %xmm0, %rdx
     966 	test	%rdx, %rdx
     967 	jnz	L(SrcLenLoopEnd)
     968 
        	/* No terminator yet: bank the running length in %r9 and scan on
        	   with aligned loads.  %rcx = misalignment, %rsi rounded down,
        	   %rax = offset of the next aligned chunk.  */
     969 	add	%rax, %r9
     970 	mov	$16, %rax
     971 	mov	%rsi, %rcx
     972 	and	$15, %rcx
     973 	and	$-16, %rsi
     974 L(SrcLenLoop):
     975 	movdqa	(%rsi, %rax), %xmm1
     976 	pcmpeqb	%xmm1, %xmm0
     977 	pmovmskb %xmm0, %rdx
     978 	test	%rdx, %rdx
     979 	jnz	L(SrcLenLoopEnd)
     980 	add	$16, %rax
     981 	jmp	L(SrcLenLoop)
     982 
     983 	.p2align 4
        /* Terminator located: add its bit index to %rax, remove the
           alignment slack %rcx (0 on the direct path), and return
           %rax + %r9.  */
     984 L(SrcLenLoopEnd):
     985 	bsf	%rdx, %rdx
     986 	add	%rdx, %rax
     987 	sub	%rcx, %rax
     988 	RETURN
     989 
     990 END (STRLCPY)
    991 
     992 	.p2align 4
        /* NOTE(review): the .p2align above is issued before the section
           switch, so it pads the preceding section rather than aligning the
           tables below -- confirm whether that is intended.  */
     993 	.section .rodata
        /* Dispatch tables for BRANCH_TO_JMPTBL_ENTRY.  Every entry is a
           32-bit offset from the start of its own table to the handler, which
           matches the macro's sign-extending load with scale 4.

           L(ExitTable)[n], n = 0..32: truncating handler L(Exit<n>).  */
     994 L(ExitTable):
     995 	.int	JMPTBL(L(Exit0), L(ExitTable))
     996 	.int	JMPTBL(L(Exit1), L(ExitTable))
     997 	.int	JMPTBL(L(Exit2), L(ExitTable))
     998 	.int	JMPTBL(L(Exit3), L(ExitTable))
     999 	.int	JMPTBL(L(Exit4), L(ExitTable))
    1000 	.int	JMPTBL(L(Exit5), L(ExitTable))
    1001 	.int	JMPTBL(L(Exit6), L(ExitTable))
    1002 	.int	JMPTBL(L(Exit7), L(ExitTable))
    1003 	.int	JMPTBL(L(Exit8), L(ExitTable))
    1004 	.int	JMPTBL(L(Exit9), L(ExitTable))
    1005 	.int	JMPTBL(L(Exit10), L(ExitTable))
    1006 	.int	JMPTBL(L(Exit11), L(ExitTable))
    1007 	.int	JMPTBL(L(Exit12), L(ExitTable))
    1008 	.int	JMPTBL(L(Exit13), L(ExitTable))
    1009 	.int	JMPTBL(L(Exit14), L(ExitTable))
    1010 	.int	JMPTBL(L(Exit15), L(ExitTable))
    1011 	.int	JMPTBL(L(Exit16), L(ExitTable))
    1012 	.int	JMPTBL(L(Exit17), L(ExitTable))
    1013 	.int	JMPTBL(L(Exit18), L(ExitTable))
    1014 	.int	JMPTBL(L(Exit19), L(ExitTable))
    1015 	.int	JMPTBL(L(Exit20), L(ExitTable))
    1016 	.int	JMPTBL(L(Exit21), L(ExitTable))
    1017 	.int	JMPTBL(L(Exit22), L(ExitTable))
    1018 	.int	JMPTBL(L(Exit23), L(ExitTable))
    1019 	.int	JMPTBL(L(Exit24), L(ExitTable))
    1020 	.int	JMPTBL(L(Exit25), L(ExitTable))
    1021 	.int	JMPTBL(L(Exit26), L(ExitTable))
    1022 	.int	JMPTBL(L(Exit27), L(ExitTable))
    1023 	.int	JMPTBL(L(Exit28), L(ExitTable))
    1024 	.int	JMPTBL(L(Exit29), L(ExitTable))
    1025 	.int	JMPTBL(L(Exit30), L(ExitTable))
    1026 	.int	JMPTBL(L(Exit31), L(ExitTable))
    1027 	.int	JMPTBL(L(Exit32), L(ExitTable))
        /* L(ExitStringTailTable)[n], n = 0..33: complete-copy handler
           L(StringTail<n>).  */
    1028 L(ExitStringTailTable):
    1029 	.int	JMPTBL(L(StringTail0), L(ExitStringTailTable))
    1030 	.int	JMPTBL(L(StringTail1), L(ExitStringTailTable))
    1031 	.int	JMPTBL(L(StringTail2), L(ExitStringTailTable))
    1032 	.int	JMPTBL(L(StringTail3), L(ExitStringTailTable))
    1033 	.int	JMPTBL(L(StringTail4), L(ExitStringTailTable))
    1034 	.int	JMPTBL(L(StringTail5), L(ExitStringTailTable))
    1035 	.int	JMPTBL(L(StringTail6), L(ExitStringTailTable))
    1036 	.int	JMPTBL(L(StringTail7), L(ExitStringTailTable))
    1037 	.int	JMPTBL(L(StringTail8), L(ExitStringTailTable))
    1038 	.int	JMPTBL(L(StringTail9), L(ExitStringTailTable))
    1039 	.int	JMPTBL(L(StringTail10), L(ExitStringTailTable))
    1040 	.int	JMPTBL(L(StringTail11), L(ExitStringTailTable))
    1041 	.int	JMPTBL(L(StringTail12), L(ExitStringTailTable))
    1042 	.int	JMPTBL(L(StringTail13), L(ExitStringTailTable))
    1043 	.int	JMPTBL(L(StringTail14), L(ExitStringTailTable))
    1044 	.int	JMPTBL(L(StringTail15), L(ExitStringTailTable))
    1045 	.int	JMPTBL(L(StringTail16), L(ExitStringTailTable))
    1046 	.int	JMPTBL(L(StringTail17), L(ExitStringTailTable))
    1047 	.int	JMPTBL(L(StringTail18), L(ExitStringTailTable))
    1048 	.int	JMPTBL(L(StringTail19), L(ExitStringTailTable))
    1049 	.int	JMPTBL(L(StringTail20), L(ExitStringTailTable))
    1050 	.int	JMPTBL(L(StringTail21), L(ExitStringTailTable))
    1051 	.int	JMPTBL(L(StringTail22), L(ExitStringTailTable))
    1052 	.int	JMPTBL(L(StringTail23), L(ExitStringTailTable))
    1053 	.int	JMPTBL(L(StringTail24), L(ExitStringTailTable))
    1054 	.int	JMPTBL(L(StringTail25), L(ExitStringTailTable))
    1055 	.int	JMPTBL(L(StringTail26), L(ExitStringTailTable))
    1056 	.int	JMPTBL(L(StringTail27), L(ExitStringTailTable))
    1057 	.int	JMPTBL(L(StringTail28), L(ExitStringTailTable))
    1058 	.int	JMPTBL(L(StringTail29), L(ExitStringTailTable))
    1059 	.int	JMPTBL(L(StringTail30), L(ExitStringTailTable))
    1060 	.int	JMPTBL(L(StringTail31), L(ExitStringTailTable))
    1061 	.int	JMPTBL(L(StringTail32), L(ExitStringTailTable))
    1062 	.int	JMPTBL(L(StringTail33), L(ExitStringTailTable))
   1063