/* Home | History | Annotate | Download | only in string */
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #include "cache.h"
     32 
/* Symbol this code is assembled under; wrapper files (memmove/bcopy
   builds) may pre-define MEMCPY before including this one.  */
     33 #ifndef MEMCPY
     34 # define MEMCPY	memcpy
     35 #endif
     36 
/* L(x) produces an assembler-local label (.Lx) when the build
   environment (e.g. glibc) has not already provided its own.  */
     37 #ifndef L
     38 # define L(label)	.L##label
     39 #endif
     40 
/* Fallbacks mapping the cfi_* helper macros straight onto the GAS
   .cfi_* directives, so DWARF unwind info is still emitted when
   building outside an environment that defines them.  */
     41 #ifndef cfi_startproc
     42 # define cfi_startproc	.cfi_startproc
     43 #endif
     44 
     45 #ifndef cfi_endproc
     46 # define cfi_endproc	.cfi_endproc
     47 #endif
     48 
     49 #ifndef cfi_rel_offset
     50 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     51 #endif
     52 
     53 #ifndef cfi_restore
     54 # define cfi_restore(reg)	.cfi_restore reg
     55 #endif
     56 
     57 #ifndef cfi_adjust_cfa_offset
     58 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     59 #endif
     60 
/* ENTRY/END fallbacks: ENTRY emits a 16-byte-aligned global function
   symbol and opens its CFI region; END closes the CFI region and sets
   the ELF symbol size.  */
     61 #ifndef ENTRY
     62 # define ENTRY(name)		\
     63 	.type name,  @function;		\
     64 	.globl name;		\
     65 	.p2align 4;		\
     66 name:		\
     67 	cfi_startproc
     68 #endif
     69 
     70 #ifndef END
     71 # define END(name)		\
     72 	cfi_endproc;		\
     73 	.size name, .-name
     74 #endif
     75 
/* Stack offsets of the three arguments relative to %esp; PARMS (the
   offset of the first argument) is defined below and depends on how
   many registers the prologue pushes.  bcopy(src, dest, n) takes its
   pointer arguments in the opposite order from memcpy(dest, src, n).  */
     76 #ifdef USE_AS_BCOPY
     77 # define SRC		PARMS
     78 # define DEST		SRC+4
     79 # define LEN		DEST+4
     80 #else
     81 # define DEST		PARMS
     82 # define SRC		DEST+4
     83 # define LEN		SRC+4
     84 #endif
     85 
/* CFI bookkeeping that must accompany every pushl/popl so the unwind
   information stays in sync with the real stack pointer.  */
     86 #define CFI_PUSH(REG)		\
     87   cfi_adjust_cfa_offset (4);		\
     88   cfi_rel_offset (REG, 0)
     89 
     90 #define CFI_POP(REG)		\
     91   cfi_adjust_cfa_offset (-4);		\
     92   cfi_restore (REG)
     93 
/* Push/pop a register together with its CFI annotation.  */
     94 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     95 #define POP(REG)	popl REG; CFI_POP (REG)
     96 
/* PIC builds must preserve %ebx (used below as the PC/GOT register)
   and cannot embed absolute addresses in jump tables, so the argument
   base offset, prologue/epilogue, and jump-table dispatch are all
   parameterized on SHARED/__PIC__.  */
     97 #if (defined SHARED || defined __PIC__)
     98 # define PARMS		8		/* Preserve EBX.  */
     99 # define ENTRANCE	PUSH (%ebx);
    100 # define RETURN_END	POP (%ebx); ret
    101 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    102 # define JMPTBL(I, B)	I - B
    103 
    104 # define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
    105 
    106 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    107 	jump table with relative offsets.  INDEX is a register containing the
    108 	index into the jump table.   SCALE is the scale of INDEX. */
    109 
    110 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    111     /* We first load PC into EBX.  */		\
    112 	SETUP_PIC_REG(bx);		\
    113     /* Get the address of the jump table.  */		\
    114 	addl	$(TABLE - .), %ebx;		\
    115     /* Get the entry and convert the relative offset to the		\
    116 	absolute	address.  */		\
    117 	addl	(%ebx, INDEX, SCALE), %ebx;		\
    118     /* We loaded the jump table.  Go.  */		\
    119 	jmp	*%ebx
    120 #else
    121 
    122 # define PARMS		4
    123 # define ENTRANCE
    124 # define RETURN_END	ret
    125 # define RETURN		RETURN_END
    126 # define JMPTBL(I, B)	I
    127 
    128 /* Branch to an entry in a jump table.  TABLE is a jump table with
    129 	absolute offsets.  INDEX is a register containing the index into the
    130 	jump table.  SCALE is the scale of INDEX. */
    131 
    132 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    133 	jmp	*TABLE(, INDEX, SCALE)
    134 #endif
    135 
/* void *memcpy(void *dest, const void *src, size_t n) — IA32 SSSE3
   implementation (also builds as memmove/bcopy via USE_AS_* macros).
   Register roles after the loads below: %ecx = n, %eax = src,
   %edx = dest.  Placed in .text.ssse3 for runtime CPU dispatch.  */
    136 	.section .text.ssse3,"ax",@progbits
    137 ENTRY (MEMCPY)
    138 	ENTRANCE
    139 	movl	LEN(%esp), %ecx
    140 	movl	SRC(%esp), %eax
    141 	movl	DEST(%esp), %edx
    142 
    143 #ifdef USE_AS_MEMMOVE
/* memmove only: pick a direction that is safe under overlap.
   dest < src -> forward copy is always safe; dest == src -> nothing
   to do; otherwise copy backward iff the regions really overlap.  */
    144 	cmp	%eax, %edx
    145 	jb	L(copy_forward)
    146 	je	L(fwd_write_0bytes)	/* 0-byte table entry, defined later  */
    147 	cmp	$32, %ecx
    148 	jae	L(memmove_bwd)
    149 	jmp	L(bk_write_less32bytes_2)	/* short backward copy, defined later  */
    150 
    151 	.p2align 4
    152 L(memmove_bwd):
/* Overlap test: regions overlap iff dest < src + n; reload %eax since
   the add clobbered it.  */
    153 	add	%ecx, %eax
    154 	cmp	%eax, %edx
    155 	movl	SRC(%esp), %eax
    156 	jb	L(copy_backward)	/* defined later in the file  */
    157 
    158 L(copy_forward):
    159 #endif
    160 	cmp	$48, %ecx
    161 	jae	L(48bytesormore)
    162 
/* n < 48: advance both pointers past the end of the copy and dispatch
   on n into a table of fixed-size copy stubs (tables defined later).  */
    163 L(fwd_write_less32bytes):
    164 #ifndef USE_AS_MEMMOVE
/* NOTE(review): only the low bytes of the two addresses are compared.
   For non-overlapping memcpy buffers either table direction copies
   correctly, so this appears to be a scheduling heuristic — confirm
   against the upstream Intel commit before relying on it.  */
    165 	cmp	%dl, %al
    166 	jb	L(bk_write)
    167 #endif
    168 	add	%ecx, %edx
    169 	add	%ecx, %eax
    170 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    171 #ifndef USE_AS_MEMMOVE
    172 	.p2align 4
    173 L(bk_write):
    174 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
    175 #endif
    176 
    177 	.p2align 4
    178 L(48bytesormore):
/* n >= 48.  Copy the first 16 bytes unaligned right away (memcpy), or
   keep them in %xmm0 to be stored later (memmove, where the head of
   dest may overlap source bytes still to be read).  */
    179 #ifndef USE_AS_MEMMOVE
    180 	movlpd	(%eax), %xmm0
    181 	movlpd	8(%eax), %xmm1
    182 	movlpd	%xmm0, (%edx)
    183 	movlpd	%xmm1, 8(%edx)
    184 #else
    185 	movdqu	(%eax), %xmm0
    186 #endif
/* Round dest up to the next 16-byte boundary.  %edi temporarily holds
   minus the head length; adding/subtracting it advances src and
   shrinks n by the bytes already covered by the head copy above.  */
    187 	PUSH (%edi)
    188 	movl	%edx, %edi
    189 	and	$-16, %edx
    190 	add	$16, %edx
    191 	sub	%edx, %edi
    192 	add	%edi, %ecx
    193 	sub	%edi, %eax
    194 
/* Strategy split: at or above half the shared cache size, take the
   non-temporal L(large_page) path (defined later in the file).  */
    195 #ifdef SHARED_CACHE_SIZE_HALF
    196 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
    197 #else
    198 # if (defined SHARED || defined __PIC__)
    199 	SETUP_PIC_REG(bx)
    200 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    201 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
    202 # else
    203 	cmp	__x86_shared_cache_size_half, %ecx
    204 # endif
    205 #endif
    206 
/* Dispatch on the source misalignment (src & 15) to the matching
   L(shl_N) palignr variant; L(shl_0) handles mutual alignment.  */
    207 	mov	%eax, %edi
    208 	jae	L(large_page)
    209 	and	$0xf, %edi
    210 	jz	L(shl_0)
    211 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
    212 
    213 	.p2align 4
/* Source and dest are now mutually 16-byte aligned.  */
    214 L(shl_0):
    215 #ifdef USE_AS_MEMMOVE
/* memmove: store the 16 head bytes saved in %xmm0.  DEST is read at
   +4 because the earlier PUSH (%edi) shifted the argument offsets.  */
    216 	movl	DEST+4(%esp), %edi
    217 	movdqu	%xmm0, (%edi)
    218 #endif
    219 	xor	%edi, %edi
    220 	cmp	$127, %ecx
    221 	ja	L(shl_0_gobble)		/* >= 128 bytes: big unrolled path  */
    222 	lea	-32(%ecx), %ecx
    223 
/* Up to 127 bytes: 4x-unrolled 32-byte aligned copy.  %edi is the
   running offset; %ecx counts down and the borrow (jb) exits.  */
    224 	.p2align 4
    225 L(shl_0_loop):
    226 	movdqa	(%eax, %edi), %xmm0
    227 	movdqa	16(%eax, %edi), %xmm1
    228 	sub	$32, %ecx
    229 	movdqa	%xmm0, (%edx, %edi)
    230 	movdqa	%xmm1, 16(%edx, %edi)
    231 	lea	32(%edi), %edi
    232 	jb	L(shl_0_end)
    233 
    234 	movdqa	(%eax, %edi), %xmm0
    235 	movdqa	16(%eax, %edi), %xmm1
    236 	sub	$32, %ecx
    237 	movdqa	%xmm0, (%edx, %edi)
    238 	movdqa	%xmm1, 16(%edx, %edi)
    239 	lea	32(%edi), %edi
    240 	jb	L(shl_0_end)
    241 
    242 	movdqa	(%eax, %edi), %xmm0
    243 	movdqa	16(%eax, %edi), %xmm1
    244 	sub	$32, %ecx
    245 	movdqa	%xmm0, (%edx, %edi)
    246 	movdqa	%xmm1, 16(%edx, %edi)
    247 	lea	32(%edi), %edi
    248 	jb	L(shl_0_end)
    249 
    250 	movdqa	(%eax, %edi), %xmm0
    251 	movdqa	16(%eax, %edi), %xmm1
    252 	sub	$32, %ecx
    253 	movdqa	%xmm0, (%edx, %edi)
    254 	movdqa	%xmm1, 16(%edx, %edi)
    255 	lea	32(%edi), %edi
    256 
/* Undo the -32 bias (leaves the 0..31 remaining count), advance both
   pointers past the copied region, and finish via the aligned tail
   table (defined later in the file).  */
    257 L(shl_0_end):
    258 	lea	32(%ecx), %ecx
    259 	add	%ecx, %edi
    260 	add	%edi, %edx
    261 	add	%edi, %eax
    262 	POP (%edi)
    263 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
    264 
/* Re-open the CFI record for %edi for the code below; the POP above
   closed it only on the taken exit path.  */
    265 	CFI_PUSH (%edi)
    266 
    267 	.p2align 4
/* Mutually aligned copy of 128+ bytes: below half the data cache size
   use the plain cached loop here; otherwise fall through to the
   prefetching L(shl_0_gobble_mem_loop) variant (defined later).  */
    268 L(shl_0_gobble):
    269 #ifdef DATA_CACHE_SIZE_HALF
    270 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    271 #else
    272 # if (defined SHARED || defined __PIC__)
    273 	SETUP_PIC_REG(bx)
    274 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    275 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    276 # else
    277 	cmp	__x86_data_cache_size_half, %ecx
    278 # endif
    279 #endif
    280 	POP	(%edi)
    281 	lea	-128(%ecx), %ecx	/* bias: borrow will mean < 128 left  */
    282 	jae	L(shl_0_gobble_mem_loop)
    283 
/* Copy 128 bytes per iteration through xmm0-xmm7 while at least 128
   bytes remain (biased %ecx stays non-negative).  */
    284 	.p2align 4
    285 L(shl_0_gobble_cache_loop):
    286 	movdqa	(%eax), %xmm0
    287 	movdqa	0x10(%eax), %xmm1
    288 	movdqa	0x20(%eax), %xmm2
    289 	movdqa	0x30(%eax), %xmm3
    290 	movdqa	0x40(%eax), %xmm4
    291 	movdqa	0x50(%eax), %xmm5
    292 	movdqa	0x60(%eax), %xmm6
    293 	movdqa	0x70(%eax), %xmm7
    294 	lea	0x80(%eax), %eax
    295 	sub	$128, %ecx
    296 	movdqa	%xmm0, (%edx)
    297 	movdqa	%xmm1, 0x10(%edx)
    298 	movdqa	%xmm2, 0x20(%edx)
    299 	movdqa	%xmm3, 0x30(%edx)
    300 	movdqa	%xmm4, 0x40(%edx)
    301 	movdqa	%xmm5, 0x50(%edx)
    302 	movdqa	%xmm6, 0x60(%edx)
    303 	movdqa	%xmm7, 0x70(%edx)
    304 	lea	0x80(%edx), %edx
    305 
    306 	jae	L(shl_0_gobble_cache_loop)
/* Tail: undo the -128 bias, then peel one optional 64-byte, 32-byte
   and 16-byte chunk before dispatching the last 0..15 bytes.  */
    307 	cmp	$-0x40, %ecx
    308 	lea	0x80(%ecx), %ecx
    309 	jl	L(shl_0_cache_less_64bytes)
    310 
    311 	movdqa	(%eax), %xmm0
    312 	sub	$0x40, %ecx
    313 	movdqa	0x10(%eax), %xmm1
    314 	movdqa	%xmm0, (%edx)
    315 	movdqa	%xmm1, 0x10(%edx)
    316 	movdqa	0x20(%eax), %xmm0
    317 	movdqa	0x30(%eax), %xmm1
    318 	add	$0x40, %eax
    319 	movdqa	%xmm0, 0x20(%edx)
    320 	movdqa	%xmm1, 0x30(%edx)
    321 	add	$0x40, %edx
    322 
    323 L(shl_0_cache_less_64bytes):
    324 	cmp	$0x20, %ecx
    325 	jb	L(shl_0_cache_less_32bytes)
    326 	movdqa	(%eax), %xmm0
    327 	sub	$0x20, %ecx
    328 	movdqa	0x10(%eax), %xmm1
    329 	add	$0x20, %eax
    330 	movdqa	%xmm0, (%edx)
    331 	movdqa	%xmm1, 0x10(%edx)
    332 	add	$0x20, %edx
    333 
    334 L(shl_0_cache_less_32bytes):
    335 	cmp	$0x10, %ecx
    336 	jb	L(shl_0_cache_less_16bytes)
    337 	sub	$0x10, %ecx
    338 	movdqa	(%eax), %xmm0
    339 	add	$0x10, %eax
    340 	movdqa	%xmm0, (%edx)
    341 	add	$0x10, %edx
    342 
/* 0..15 bytes left: finish via the short-copy table (defined later).  */
    343 L(shl_0_cache_less_16bytes):
    344 	add	%ecx, %edx
    345 	add	%ecx, %eax
    346 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    347 
    348 	.p2align 4
    349 L(shl_0_gobble_mem_loop):
    350 	prefetcht0 0x1c0(%eax)
    351 	prefetcht0 0x280(%eax)
    352 	prefetcht0 0x1c0(%edx)
    353 
    354 	movdqa	(%eax), %xmm0
    355 	movdqa	0x10(%eax), %xmm1
    356 	movdqa	0x20(%eax), %xmm2
    357 	movdqa	0x30(%eax), %xmm3
    358 	movdqa	0x40(%eax), %xmm4
    359 	movdqa	0x50(%eax), %xmm5
    360 	movdqa	0x60(%eax), %xmm6
    361 	movdqa	0x70(%eax), %xmm7
    362 	lea	0x80(%eax), %eax
    363 	sub	$0x80, %ecx
    364 	movdqa	%xmm0, (%edx)
    365 	movdqa	%xmm1, 0x10(%edx)
    366 	movdqa	%xmm2, 0x20(%edx)
    367 	movdqa	%xmm3, 0x30(%edx)
    368 	movdqa	%xmm4, 0x40(%edx)
    369 	movdqa	%xmm5, 0x50(%edx)
    370 	movdqa	%xmm6, 0x60(%edx)
    371 	movdqa	%xmm7, 0x70(%edx)
    372 	lea	0x80(%edx), %edx
    373 
    374 	jae	L(shl_0_gobble_mem_loop)
    375 	cmp	$-0x40, %ecx
    376 	lea	0x80(%ecx), %ecx
    377 	jl	L(shl_0_mem_less_64bytes)
    378 
    379 	movdqa	(%eax), %xmm0
    380 	sub	$0x40, %ecx
    381 	movdqa	0x10(%eax), %xmm1
    382 
    383 	movdqa	%xmm0, (%edx)
    384 	movdqa	%xmm1, 0x10(%edx)
    385 
    386 	movdqa	0x20(%eax), %xmm0
    387 	movdqa	0x30(%eax), %xmm1
    388 	add	$0x40, %eax
    389 
    390 	movdqa	%xmm0, 0x20(%edx)
    391 	movdqa	%xmm1, 0x30(%edx)
    392 	add	$0x40, %edx
    393 
    394 L(shl_0_mem_less_64bytes):
    395 	cmp	$0x20, %ecx
    396 	jb	L(shl_0_mem_less_32bytes)
    397 	movdqa	(%eax), %xmm0
    398 	sub	$0x20, %ecx
    399 	movdqa	0x10(%eax), %xmm1
    400 	add	$0x20, %eax
    401 	movdqa	%xmm0, (%edx)
    402 	movdqa	%xmm1, 0x10(%edx)
    403 	add	$0x20, %edx
    404 
    405 L(shl_0_mem_less_32bytes):
    406 	cmp	$0x10, %ecx
    407 	jb	L(shl_0_mem_less_16bytes)
    408 	sub	$0x10, %ecx
    409 	movdqa	(%eax), %xmm0
    410 	add	$0x10, %eax
    411 	movdqa	%xmm0, (%edx)
    412 	add	$0x10, %edx
    413 
    414 L(shl_0_mem_less_16bytes):
    415 	add	%ecx, %edx
    416 	add	%ecx, %eax
    417 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
    418 
    419 	.p2align 4
/* Source is 1 byte past a 16-byte boundary (src & 15 == 1 after dest
   was aligned).  Aligned loads are taken at src-1 and consecutive
   16-byte blocks are stitched together with palignr $1; %xmm1 always
   carries the previous aligned block across iterations.  The
   shl_2..shl_7 variants later in the file repeat this pattern with
   their own shift amount.  */
    420 L(shl_1):
    421 #ifndef USE_AS_MEMMOVE
    422 	movaps	-1(%eax), %xmm1
    423 #else
/* memmove: flush the 16 saved head bytes first (DEST read at +4
   because %edi is still on the stack).  */
    424 	movl	DEST+4(%esp), %edi
    425 	movaps	-1(%eax), %xmm1
    426 	movdqu	%xmm0, (%edi)
    427 #endif
    428 #ifdef DATA_CACHE_SIZE_HALF
    429 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    430 #else
    431 # if (defined SHARED || defined __PIC__)
    432 	SETUP_PIC_REG(bx)
    433 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    434 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    435 # else
    436 	cmp	__x86_data_cache_size_half, %ecx
    437 # endif
    438 #endif
    439 	jb L(sh_1_no_prefetch)
    440 
    441 	lea	-64(%ecx), %ecx
    442 
/* 64 bytes per iteration with software prefetch 0x1c0 (448) bytes
   ahead of both streams.  */
    443 	.p2align 4
    444 L(Shl1LoopStart):
    445 	prefetcht0 0x1c0(%eax)
    446 	prefetcht0 0x1c0(%edx)
    447 	movaps	15(%eax), %xmm2
    448 	movaps	31(%eax), %xmm3
    449 	movaps	47(%eax), %xmm4
    450 	movaps	63(%eax), %xmm5
    451 	movaps	%xmm5, %xmm7
    452 	palignr	$1, %xmm4, %xmm5
    453 	palignr	$1, %xmm3, %xmm4
    454 	movaps	%xmm5, 48(%edx)
    455 	palignr	$1, %xmm2, %xmm3
    456 	lea	64(%eax), %eax
    457 	palignr	$1, %xmm1, %xmm2
    458 	movaps	%xmm4, 32(%edx)
    459 	movaps	%xmm3, 16(%edx)
    460 	movaps	%xmm7, %xmm1	/* carry last aligned block to next iter  */
    461 	movaps	%xmm2, (%edx)
    462 	lea	64(%edx), %edx
    463 	sub	$64, %ecx
    464 	ja	L(Shl1LoopStart)
    465 
/* Loop left with %ecx in [-64, 0] (i.e. 0..64 bytes remain after the
   +32 bias): copy one more 32-byte pair if needed, then finish via
   the tail table.  L(shl_end_0) is defined later in the file.  */
    466 L(Shl1LoopLeave):
    467 	add	$32, %ecx
    468 	jle	L(shl_end_0)
    469 
    470 	movaps	15(%eax), %xmm2
    471 	movaps	31(%eax), %xmm3
    472 	palignr	$1, %xmm2, %xmm3
    473 	palignr	$1, %xmm1, %xmm2
    474 	movaps	%xmm2, (%edx)
    475 	movaps	%xmm3, 16(%edx)
    476 	lea	32(%edx, %ecx), %edx
    477 	lea	32(%eax, %ecx), %eax
    478 	POP (%edi)
    479 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    480 
    481 	CFI_PUSH (%edi)
    482 
/* Smaller copies (below half the data cache): same palignr scheme,
   32 bytes per iteration, no prefetch.  %eax is rewound by 1 so all
   loads below are 16-byte aligned; %edi is the running offset.  */
    483 	.p2align 4
    484 L(sh_1_no_prefetch):
    485 	lea	-32(%ecx), %ecx
    486 	lea	-1(%eax), %eax
    487 	xor	%edi, %edi
    488 
    489 	.p2align 4
    490 L(sh_1_no_prefetch_loop):
    491 	movdqa	16(%eax, %edi), %xmm2
    492 	sub	$32, %ecx
    493 	movdqa	32(%eax, %edi), %xmm3
    494 	movdqa	%xmm3, %xmm4
    495 	palignr	$1, %xmm2, %xmm3
    496 	palignr	$1, %xmm1, %xmm2
    497 	lea	32(%edi), %edi
    498 	movdqa	%xmm2, -32(%edx, %edi)
    499 	movdqa	%xmm3, -16(%edx, %edi)
    500 	jb	L(sh_1_end_no_prefetch_loop)
    501 
    502 	movdqa	16(%eax, %edi), %xmm2
    503 	sub	$32, %ecx
    504 	movdqa	32(%eax, %edi), %xmm3
    505 	movdqa	%xmm3, %xmm1
    506 	palignr	$1, %xmm2, %xmm3
    507 	palignr	$1, %xmm4, %xmm2
    508 	lea	32(%edi), %edi
    509 	movdqa	%xmm2, -32(%edx, %edi)
    510 	movdqa	%xmm3, -16(%edx, %edi)
    511 	jae	L(sh_1_no_prefetch_loop)
    512 
/* Rebuild the pointers from the loop offset; the +1 undoes the
   earlier rewind of %eax.  */
    513 L(sh_1_end_no_prefetch_loop):
    514 	lea	32(%ecx), %ecx
    515 	add	%ecx, %edi
    516 	add	%edi, %edx
    517 	lea	1(%edi, %eax), %eax
    518 	POP	(%edi)
    519 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    520 
    521 	CFI_PUSH (%edi)
    522 
    523 	.p2align 4
    524 L(shl_2):
    525 #ifndef USE_AS_MEMMOVE
    526 	movaps	-2(%eax), %xmm1
    527 #else
    528 	movl	DEST+4(%esp), %edi
    529 	movaps	-2(%eax), %xmm1
    530 	movdqu	%xmm0, (%edi)
    531 #endif
    532 #ifdef DATA_CACHE_SIZE_HALF
    533 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    534 #else
    535 # if (defined SHARED || defined __PIC__)
    536 	SETUP_PIC_REG(bx)
    537 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    538 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    539 # else
    540 	cmp	__x86_data_cache_size_half, %ecx
    541 # endif
    542 #endif
    543 	jb L(sh_2_no_prefetch)
    544 
    545 	lea	-64(%ecx), %ecx
    546 
    547 	.p2align 4
    548 L(Shl2LoopStart):
    549 	prefetcht0 0x1c0(%eax)
    550 	prefetcht0 0x1c0(%edx)
    551 	movaps	14(%eax), %xmm2
    552 	movaps	30(%eax), %xmm3
    553 	movaps	46(%eax), %xmm4
    554 	movaps	62(%eax), %xmm5
    555 	movaps	%xmm5, %xmm7
    556 	palignr	$2, %xmm4, %xmm5
    557 	palignr	$2, %xmm3, %xmm4
    558 	movaps	%xmm5, 48(%edx)
    559 	palignr	$2, %xmm2, %xmm3
    560 	lea	64(%eax), %eax
    561 	palignr	$2, %xmm1, %xmm2
    562 	movaps	%xmm4, 32(%edx)
    563 	movaps	%xmm3, 16(%edx)
    564 	movaps	%xmm7, %xmm1
    565 	movaps	%xmm2, (%edx)
    566 	lea	64(%edx), %edx
    567 	sub	$64, %ecx
    568 	ja	L(Shl2LoopStart)
    569 
    570 L(Shl2LoopLeave):
    571 	add	$32, %ecx
    572 	jle	L(shl_end_0)
    573 
    574 	movaps	14(%eax), %xmm2
    575 	movaps	30(%eax), %xmm3
    576 	palignr	$2, %xmm2, %xmm3
    577 	palignr	$2, %xmm1, %xmm2
    578 	movaps	%xmm2, (%edx)
    579 	movaps	%xmm3, 16(%edx)
    580 	lea	32(%edx, %ecx), %edx
    581 	lea	32(%eax, %ecx), %eax
    582 	POP (%edi)
    583 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    584 
    585 	CFI_PUSH (%edi)
    586 
    587 	.p2align 4
    588 L(sh_2_no_prefetch):
    589 	lea	-32(%ecx), %ecx
    590 	lea	-2(%eax), %eax
    591 	xor	%edi, %edi
    592 
    593 	.p2align 4
    594 L(sh_2_no_prefetch_loop):
    595 	movdqa	16(%eax, %edi), %xmm2
    596 	sub	$32, %ecx
    597 	movdqa	32(%eax, %edi), %xmm3
    598 	movdqa	%xmm3, %xmm4
    599 	palignr	$2, %xmm2, %xmm3
    600 	palignr	$2, %xmm1, %xmm2
    601 	lea	32(%edi), %edi
    602 	movdqa	%xmm2, -32(%edx, %edi)
    603 	movdqa	%xmm3, -16(%edx, %edi)
    604 	jb	L(sh_2_end_no_prefetch_loop)
    605 
    606 	movdqa	16(%eax, %edi), %xmm2
    607 	sub	$32, %ecx
    608 	movdqa	32(%eax, %edi), %xmm3
    609 	movdqa	%xmm3, %xmm1
    610 	palignr	$2, %xmm2, %xmm3
    611 	palignr	$2, %xmm4, %xmm2
    612 	lea	32(%edi), %edi
    613 	movdqa	%xmm2, -32(%edx, %edi)
    614 	movdqa	%xmm3, -16(%edx, %edi)
    615 	jae	L(sh_2_no_prefetch_loop)
    616 
    617 L(sh_2_end_no_prefetch_loop):
    618 	lea	32(%ecx), %ecx
    619 	add	%ecx, %edi
    620 	add	%edi, %edx
    621 	lea	2(%edi, %eax), %eax
    622 	POP	(%edi)
    623 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    624 
    625 	CFI_PUSH (%edi)
    626 
    627 	.p2align 4
    628 L(shl_3):
    629 #ifndef USE_AS_MEMMOVE
    630 	movaps	-3(%eax), %xmm1
    631 #else
    632 	movl	DEST+4(%esp), %edi
    633 	movaps	-3(%eax), %xmm1
    634 	movdqu	%xmm0, (%edi)
    635 #endif
    636 #ifdef DATA_CACHE_SIZE_HALF
    637 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    638 #else
    639 # if (defined SHARED || defined __PIC__)
    640 	SETUP_PIC_REG(bx)
    641 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    642 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    643 # else
    644 	cmp	__x86_data_cache_size_half, %ecx
    645 # endif
    646 #endif
    647 	jb L(sh_3_no_prefetch)
    648 
    649 	lea	-64(%ecx), %ecx
    650 
    651 	.p2align 4
    652 L(Shl3LoopStart):
    653 	prefetcht0 0x1c0(%eax)
    654 	prefetcht0 0x1c0(%edx)
    655 	movaps	13(%eax), %xmm2
    656 	movaps	29(%eax), %xmm3
    657 	movaps	45(%eax), %xmm4
    658 	movaps	61(%eax), %xmm5
    659 	movaps	%xmm5, %xmm7
    660 	palignr	$3, %xmm4, %xmm5
    661 	palignr	$3, %xmm3, %xmm4
    662 	movaps	%xmm5, 48(%edx)
    663 	palignr	$3, %xmm2, %xmm3
    664 	lea	64(%eax), %eax
    665 	palignr	$3, %xmm1, %xmm2
    666 	movaps	%xmm4, 32(%edx)
    667 	movaps	%xmm3, 16(%edx)
    668 	movaps	%xmm7, %xmm1
    669 	movaps	%xmm2, (%edx)
    670 	lea	64(%edx), %edx
    671 	sub	$64, %ecx
    672 	ja	L(Shl3LoopStart)
    673 
    674 L(Shl3LoopLeave):
    675 	add	$32, %ecx
    676 	jle	L(shl_end_0)
    677 
    678 	movaps	13(%eax), %xmm2
    679 	movaps	29(%eax), %xmm3
    680 	palignr	$3, %xmm2, %xmm3
    681 	palignr	$3, %xmm1, %xmm2
    682 	movaps	%xmm2, (%edx)
    683 	movaps	%xmm3, 16(%edx)
    684 	lea	32(%edx, %ecx), %edx
    685 	lea	32(%eax, %ecx), %eax
    686 	POP (%edi)
    687 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    688 
    689 	CFI_PUSH (%edi)
    690 
    691 	.p2align 4
    692 L(sh_3_no_prefetch):
    693 	lea	-32(%ecx), %ecx
    694 	lea	-3(%eax), %eax
    695 	xor	%edi, %edi
    696 
    697 	.p2align 4
    698 L(sh_3_no_prefetch_loop):
    699 	movdqa	16(%eax, %edi), %xmm2
    700 	sub	$32, %ecx
    701 	movdqa	32(%eax, %edi), %xmm3
    702 	movdqa	%xmm3, %xmm4
    703 	palignr	$3, %xmm2, %xmm3
    704 	palignr	$3, %xmm1, %xmm2
    705 	lea	32(%edi), %edi
    706 	movdqa	%xmm2, -32(%edx, %edi)
    707 	movdqa	%xmm3, -16(%edx, %edi)
    708 
    709 	jb	L(sh_3_end_no_prefetch_loop)
    710 
    711 	movdqa	16(%eax, %edi), %xmm2
    712 	sub	$32, %ecx
    713 	movdqa	32(%eax, %edi), %xmm3
    714 	movdqa	%xmm3, %xmm1
    715 	palignr	$3, %xmm2, %xmm3
    716 	palignr	$3, %xmm4, %xmm2
    717 	lea	32(%edi), %edi
    718 	movdqa	%xmm2, -32(%edx, %edi)
    719 	movdqa	%xmm3, -16(%edx, %edi)
    720 
    721 	jae	L(sh_3_no_prefetch_loop)
    722 
    723 L(sh_3_end_no_prefetch_loop):
    724 	lea	32(%ecx), %ecx
    725 	add	%ecx, %edi
    726 	add	%edi, %edx
    727 	lea	3(%edi, %eax), %eax
    728 	POP	(%edi)
    729 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    730 
    731 	CFI_PUSH (%edi)
    732 
    733 	.p2align 4
    734 L(shl_4):
    735 #ifndef USE_AS_MEMMOVE
    736 	movaps	-4(%eax), %xmm1
    737 #else
    738 	movl	DEST+4(%esp), %edi
    739 	movaps	-4(%eax), %xmm1
    740 	movdqu	%xmm0, (%edi)
    741 #endif
    742 #ifdef DATA_CACHE_SIZE_HALF
    743 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    744 #else
    745 # if (defined SHARED || defined __PIC__)
    746 	SETUP_PIC_REG(bx)
    747 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    748 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    749 # else
    750 	cmp	__x86_data_cache_size_half, %ecx
    751 # endif
    752 #endif
    753 	jb L(sh_4_no_prefetch)
    754 
    755 	lea	-64(%ecx), %ecx
    756 
    757 	.p2align 4
    758 L(Shl4LoopStart):
    759 	prefetcht0 0x1c0(%eax)
    760 	prefetcht0 0x1c0(%edx)
    761 	movaps	12(%eax), %xmm2
    762 	movaps	28(%eax), %xmm3
    763 	movaps	44(%eax), %xmm4
    764 	movaps	60(%eax), %xmm5
    765 	movaps	%xmm5, %xmm7
    766 	palignr	$4, %xmm4, %xmm5
    767 	palignr	$4, %xmm3, %xmm4
    768 	movaps	%xmm5, 48(%edx)
    769 	palignr	$4, %xmm2, %xmm3
    770 	lea	64(%eax), %eax
    771 	palignr	$4, %xmm1, %xmm2
    772 	movaps	%xmm4, 32(%edx)
    773 	movaps	%xmm3, 16(%edx)
    774 	movaps	%xmm7, %xmm1
    775 	movaps	%xmm2, (%edx)
    776 	lea	64(%edx), %edx
    777 	sub	$64, %ecx
    778 	ja	L(Shl4LoopStart)
    779 
    780 L(Shl4LoopLeave):
    781 	add	$32, %ecx
    782 	jle	L(shl_end_0)
    783 
    784 	movaps	12(%eax), %xmm2
    785 	movaps	28(%eax), %xmm3
    786 	palignr	$4, %xmm2, %xmm3
    787 	palignr	$4, %xmm1, %xmm2
    788 	movaps	%xmm2, (%edx)
    789 	movaps	%xmm3, 16(%edx)
    790 	lea	32(%edx, %ecx), %edx
    791 	lea	32(%eax, %ecx), %eax
    792 	POP (%edi)
    793 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    794 
    795 	CFI_PUSH (%edi)
    796 
    797 	.p2align 4
    798 L(sh_4_no_prefetch):
    799 	lea	-32(%ecx), %ecx
    800 	lea	-4(%eax), %eax
    801 	xor	%edi, %edi
    802 
    803 	.p2align 4
    804 L(sh_4_no_prefetch_loop):
    805 	movdqa	16(%eax, %edi), %xmm2
    806 	sub	$32, %ecx
    807 	movdqa	32(%eax, %edi), %xmm3
    808 	movdqa	%xmm3, %xmm4
    809 	palignr	$4, %xmm2, %xmm3
    810 	palignr	$4, %xmm1, %xmm2
    811 	lea	32(%edi), %edi
    812 	movdqa	%xmm2, -32(%edx, %edi)
    813 	movdqa	%xmm3, -16(%edx, %edi)
    814 
    815 	jb	L(sh_4_end_no_prefetch_loop)
    816 
    817 	movdqa	16(%eax, %edi), %xmm2
    818 	sub	$32, %ecx
    819 	movdqa	32(%eax, %edi), %xmm3
    820 	movdqa	%xmm3, %xmm1
    821 	palignr	$4, %xmm2, %xmm3
    822 	palignr	$4, %xmm4, %xmm2
    823 	lea	32(%edi), %edi
    824 	movdqa	%xmm2, -32(%edx, %edi)
    825 	movdqa	%xmm3, -16(%edx, %edi)
    826 
    827 	jae	L(sh_4_no_prefetch_loop)
    828 
    829 L(sh_4_end_no_prefetch_loop):
    830 	lea	32(%ecx), %ecx
    831 	add	%ecx, %edi
    832 	add	%edi, %edx
    833 	lea	4(%edi, %eax), %eax
    834 	POP	(%edi)
    835 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    836 
    837 	CFI_PUSH (%edi)
    838 
    839 	.p2align 4
    840 L(shl_5):
    841 #ifndef USE_AS_MEMMOVE
    842 	movaps	-5(%eax), %xmm1
    843 #else
    844 	movl	DEST+4(%esp), %edi
    845 	movaps	-5(%eax), %xmm1
    846 	movdqu	%xmm0, (%edi)
    847 #endif
    848 #ifdef DATA_CACHE_SIZE_HALF
    849 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    850 #else
    851 # if (defined SHARED || defined __PIC__)
    852 	SETUP_PIC_REG(bx)
    853 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    854 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    855 # else
    856 	cmp	__x86_data_cache_size_half, %ecx
    857 # endif
    858 #endif
    859 	jb L(sh_5_no_prefetch)
    860 
    861 	lea	-64(%ecx), %ecx
    862 
    863 	.p2align 4
    864 L(Shl5LoopStart):
    865 	prefetcht0 0x1c0(%eax)
    866 	prefetcht0 0x1c0(%edx)
    867 	movaps	11(%eax), %xmm2
    868 	movaps	27(%eax), %xmm3
    869 	movaps	43(%eax), %xmm4
    870 	movaps	59(%eax), %xmm5
    871 	movaps	%xmm5, %xmm7
    872 	palignr	$5, %xmm4, %xmm5
    873 	palignr	$5, %xmm3, %xmm4
    874 	movaps	%xmm5, 48(%edx)
    875 	palignr	$5, %xmm2, %xmm3
    876 	lea	64(%eax), %eax
    877 	palignr	$5, %xmm1, %xmm2
    878 	movaps	%xmm4, 32(%edx)
    879 	movaps	%xmm3, 16(%edx)
    880 	movaps	%xmm7, %xmm1
    881 	movaps	%xmm2, (%edx)
    882 	lea	64(%edx), %edx
    883 	sub	$64, %ecx
    884 	ja	L(Shl5LoopStart)
    885 
    886 L(Shl5LoopLeave):
    887 	add	$32, %ecx
    888 	jle	L(shl_end_0)
    889 
    890 	movaps	11(%eax), %xmm2
    891 	movaps	27(%eax), %xmm3
    892 	palignr	$5, %xmm2, %xmm3
    893 	palignr	$5, %xmm1, %xmm2
    894 	movaps	%xmm2, (%edx)
    895 	movaps	%xmm3, 16(%edx)
    896 	lea	32(%edx, %ecx), %edx
    897 	lea	32(%eax, %ecx), %eax
    898 	POP (%edi)
    899 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    900 
    901 	CFI_PUSH (%edi)
    902 
    903 	.p2align 4
    904 L(sh_5_no_prefetch):
    905 	lea	-32(%ecx), %ecx
    906 	lea	-5(%eax), %eax
    907 	xor	%edi, %edi
    908 
    909 	.p2align 4
    910 L(sh_5_no_prefetch_loop):
    911 	movdqa	16(%eax, %edi), %xmm2
    912 	sub	$32, %ecx
    913 	movdqa	32(%eax, %edi), %xmm3
    914 	movdqa	%xmm3, %xmm4
    915 	palignr	$5, %xmm2, %xmm3
    916 	palignr	$5, %xmm1, %xmm2
    917 	lea	32(%edi), %edi
    918 	movdqa	%xmm2, -32(%edx, %edi)
    919 	movdqa	%xmm3, -16(%edx, %edi)
    920 
    921 	jb	L(sh_5_end_no_prefetch_loop)
    922 
    923 	movdqa	16(%eax, %edi), %xmm2
    924 	sub	$32, %ecx
    925 	movdqa	32(%eax, %edi), %xmm3
    926 	movdqa	%xmm3, %xmm1
    927 	palignr	$5, %xmm2, %xmm3
    928 	palignr	$5, %xmm4, %xmm2
    929 	lea	32(%edi), %edi
    930 	movdqa	%xmm2, -32(%edx, %edi)
    931 	movdqa	%xmm3, -16(%edx, %edi)
    932 
    933 	jae	L(sh_5_no_prefetch_loop)
    934 
    935 L(sh_5_end_no_prefetch_loop):
    936 	lea	32(%ecx), %ecx
    937 	add	%ecx, %edi
    938 	add	%edi, %edx
    939 	lea	5(%edi, %eax), %eax
    940 	POP	(%edi)
    941 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    942 
    943 	CFI_PUSH (%edi)
    944 
    945 	.p2align 4
    946 L(shl_6):
    947 #ifndef USE_AS_MEMMOVE
    948 	movaps	-6(%eax), %xmm1
    949 #else
    950 	movl	DEST+4(%esp), %edi
    951 	movaps	-6(%eax), %xmm1
    952 	movdqu	%xmm0, (%edi)
    953 #endif
    954 #ifdef DATA_CACHE_SIZE_HALF
    955 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    956 #else
    957 # if (defined SHARED || defined __PIC__)
    958 	SETUP_PIC_REG(bx)
    959 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    960 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    961 # else
    962 	cmp	__x86_data_cache_size_half, %ecx
    963 # endif
    964 #endif
    965 	jb L(sh_6_no_prefetch)
    966 
    967 	lea	-64(%ecx), %ecx
    968 
    969 	.p2align 4
    970 L(Shl6LoopStart):
    971 	prefetcht0 0x1c0(%eax)
    972 	prefetcht0 0x1c0(%edx)
    973 	movaps	10(%eax), %xmm2
    974 	movaps	26(%eax), %xmm3
    975 	movaps	42(%eax), %xmm4
    976 	movaps	58(%eax), %xmm5
    977 	movaps	%xmm5, %xmm7
    978 	palignr	$6, %xmm4, %xmm5
    979 	palignr	$6, %xmm3, %xmm4
    980 	movaps	%xmm5, 48(%edx)
    981 	palignr	$6, %xmm2, %xmm3
    982 	lea	64(%eax), %eax
    983 	palignr	$6, %xmm1, %xmm2
    984 	movaps	%xmm4, 32(%edx)
    985 	movaps	%xmm3, 16(%edx)
    986 	movaps	%xmm7, %xmm1
    987 	movaps	%xmm2, (%edx)
    988 	lea	64(%edx), %edx
    989 	sub	$64, %ecx
    990 	ja	L(Shl6LoopStart)
    991 
    992 L(Shl6LoopLeave):
    993 	add	$32, %ecx
    994 	jle	L(shl_end_0)
    995 
    996 	movaps	10(%eax), %xmm2
    997 	movaps	26(%eax), %xmm3
    998 	palignr	$6, %xmm2, %xmm3
    999 	palignr	$6, %xmm1, %xmm2
   1000 	movaps	%xmm2, (%edx)
   1001 	movaps	%xmm3, 16(%edx)
   1002 	lea	32(%edx, %ecx), %edx
   1003 	lea	32(%eax, %ecx), %eax
   1004 	POP (%edi)
   1005 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   1006 
   1007 	CFI_PUSH (%edi)
   1008 
   1009 	.p2align 4
   1010 L(sh_6_no_prefetch):
   1011 	lea	-32(%ecx), %ecx
   1012 	lea	-6(%eax), %eax
   1013 	xor	%edi, %edi
   1014 
   1015 	.p2align 4
   1016 L(sh_6_no_prefetch_loop):
   1017 	movdqa	16(%eax, %edi), %xmm2
   1018 	sub	$32, %ecx
   1019 	movdqa	32(%eax, %edi), %xmm3
   1020 	movdqa	%xmm3, %xmm4
   1021 	palignr	$6, %xmm2, %xmm3
   1022 	palignr	$6, %xmm1, %xmm2
   1023 	lea	32(%edi), %edi
   1024 	movdqa	%xmm2, -32(%edx, %edi)
   1025 	movdqa	%xmm3, -16(%edx, %edi)
   1026 
   1027 	jb	L(sh_6_end_no_prefetch_loop)
   1028 
   1029 	movdqa	16(%eax, %edi), %xmm2
   1030 	sub	$32, %ecx
   1031 	movdqa	32(%eax, %edi), %xmm3
   1032 	movdqa	%xmm3, %xmm1
   1033 	palignr	$6, %xmm2, %xmm3
   1034 	palignr	$6, %xmm4, %xmm2
   1035 	lea	32(%edi), %edi
   1036 	movdqa	%xmm2, -32(%edx, %edi)
   1037 	movdqa	%xmm3, -16(%edx, %edi)
   1038 
   1039 	jae	L(sh_6_no_prefetch_loop)
   1040 
   1041 L(sh_6_end_no_prefetch_loop):
   1042 	lea	32(%ecx), %ecx
   1043 	add	%ecx, %edi
   1044 	add	%edi, %edx
   1045 	lea	6(%edi, %eax), %eax
   1046 	POP	(%edi)
   1047 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   1048 
   1049 	CFI_PUSH (%edi)
   1050 
	.p2align 4
/* Forward copy for src addresses 7 bytes past 16-byte alignment (dst is
   16-byte aligned).  Aligned 16-byte loads are re-combined with
   palignr $7.  Register roles as used by the visible code (dispatcher
   not in view - NOTE(review): confirm against the entry code):
   %eax = src, %edx = dst, %ecx = bytes remaining, %xmm1 = previously
   loaded aligned 16-byte source block.  */
L(shl_7):
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1		/* aligned block ending 7 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-7(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_7_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src 448 bytes ahead */
	prefetcht0 0x1c0(%edx)		/* prefetch dst 448 bytes ahead */
	movaps	9(%eax), %xmm2		/* 9 = 16-7: next aligned 16B blocks */
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$7, %xmm4, %xmm5	/* stitch adjacent blocks at byte 7 */
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)	/* loop while more than 64 bytes remain */

L(Shl7LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: copy one more 32-byte chunk, then tail.  */
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-7(%eax), %eax		/* align src down by 7 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	7(%edi, %eax), %eax	/* src past last byte (undo -7 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1154 
	.p2align 4
/* Forward copy for src addresses 8 bytes past 16-byte alignment (dst is
   16-byte aligned); aligned loads re-combined with palignr $8.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1		/* aligned block ending 8 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_8_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2		/* 8 = 16-8: next aligned 16B blocks */
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)	/* loop while more than 64 bytes remain */

L(LoopLeave8):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-8(%eax), %eax		/* align src down by 8 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	8(%edi, %eax), %eax	/* src past last byte (undo -8 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1258 
	.p2align 4
/* Forward copy for src addresses 9 bytes past 16-byte alignment (dst is
   16-byte aligned); aligned loads re-combined with palignr $9.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1		/* aligned block ending 9 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_9_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2		/* 7 = 16-9: next aligned 16B blocks */
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)	/* loop while more than 64 bytes remain */

L(Shl9LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-9(%eax), %eax		/* align src down by 9 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	9(%edi, %eax), %eax	/* src past last byte (undo -9 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1363 
	.p2align 4
/* Forward copy for src addresses 10 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $10.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1	/* aligned block ending 10 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-10(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_10_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2		/* 6 = 16-10: next aligned 16B blocks */
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)	/* loop while more than 64 bytes remain */

L(Shl10LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_10_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-10(%eax), %eax		/* align src down by 10 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	10(%edi, %eax), %eax	/* src past last byte (undo -10 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1468 
	.p2align 4
/* Forward copy for src addresses 11 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $11.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1	/* aligned block ending 11 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-11(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_11_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2		/* 5 = 16-11: next aligned 16B blocks */
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)	/* loop while more than 64 bytes remain */

L(Shl11LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-11(%eax), %eax		/* align src down by 11 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	11(%edi, %eax), %eax	/* src past last byte (undo -11 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1573 
	.p2align 4
/* Forward copy for src addresses 12 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $12.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1	/* aligned block ending 12 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-12(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_12_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2		/* 4 = 16-12: next aligned 16B blocks */
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)	/* loop while more than 64 bytes remain */

L(Shl12LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-12(%eax), %eax		/* align src down by 12 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	12(%edi, %eax), %eax	/* src past last byte (undo -12 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1678 
	.p2align 4
/* Forward copy for src addresses 13 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $13.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1	/* aligned block ending 13 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-13(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_13_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2		/* 3 = 16-13: next aligned 16B blocks */
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)	/* loop while more than 64 bytes remain */

L(Shl13LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-13(%eax), %eax		/* align src down by 13 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	13(%edi, %eax), %eax	/* src past last byte (undo -13 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1783 
	.p2align 4
/* Forward copy for src addresses 14 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $14.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1	/* aligned block ending 14 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-14(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_14_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2		/* 2 = 16-14: next aligned 16B blocks */
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)	/* loop while more than 64 bytes remain */

L(Shl14LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-14(%eax), %eax		/* align src down by 14 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	14(%edi, %eax), %eax	/* src past last byte (undo -14 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1888 
	.p2align 4
/* Forward copy for src addresses 15 bytes past 16-byte alignment (dst
   is 16-byte aligned); aligned loads re-combined with palignr $15.
   Register roles as used by the visible code (dispatcher not in view -
   NOTE(review): confirm): %eax = src, %edx = dst, %ecx = bytes
   remaining, %xmm1 = previous aligned 16-byte source block.  */
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1	/* aligned block ending 15 bytes before src */
#else
	movl	DEST+4(%esp), %edi	/* memmove: reload original dst */
	movaps	-15(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* store first 16 src bytes saved in xmm0 */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_15_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx		/* bias count: 'ja' below == >64 left */

	.p2align 4
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)		/* prefetch src/dst 448 bytes ahead */
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2		/* 1 = 16-15: next aligned 16B blocks */
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for next iteration */
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry block across iterations */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)	/* loop while more than 64 bytes remain */

L(Shl15LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <= 32 bytes left: shared tail */

	/* 32..64 bytes left: one more 32-byte chunk, then table tail.  */
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point past last dst byte */
	lea	32(%eax, %ecx), %eax	/* point past last src byte */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* unwind info: edi still pushed below */

	.p2align 4
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias count for the 32B/iter loop */
	lea	-15(%eax), %eax		/* align src down by 15 */
	xor	%edi, %edi		/* edi = running copy offset */

	.p2align 4
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* save for second unrolled half */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)	/* count went negative: done */

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block into next iteration */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo bias: ecx = leftover 0..31 */
	add	%ecx, %edi
	add	%edi, %edx		/* dst past last byte */
	lea	15(%edi, %eax), %eax	/* src past last byte (undo -15 bias) */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1993 
	.p2align 4
/* Shared tail for all Shl*LoopLeave paths when at most 32 bytes remain.
   On entry %ecx holds the remaining count minus 32 (in -32..0); restore
   it, advance src/dst past the end, and dispatch to the forward tail
   jump table (entries use negative offsets from the end pointers).  */
L(shl_end_0):
	lea	32(%ecx), %ecx		/* ecx = remaining bytes, 0..32 */
	lea	(%edx, %ecx), %edx	/* dst past last byte */
	lea	(%eax, %ecx), %eax	/* src past last byte */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   2001 
	.p2align 4
/* Jump-table tail chain for lengths 44/36/28/20/12/4 (len % 8 == 4).
   %eax/%edx point one past the end of src/dst; each entry copies 8
   bytes via movq and falls through to the next-smaller entry.  */
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx		/* final 4 bytes */
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2029 
	.p2align 4
/* Jump-table tail chain for lengths 40/32/24/16/8/0 (len % 8 == 0).
   %eax/%edx point one past the end of src/dst; each entry copies 8
   bytes via movq and falls through to the next-smaller entry.  */
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2055 
	.p2align 4
/* Tail for exactly 5 bytes: two overlapping 4-byte moves.
   %eax/%edx point one past the end of src/dst.  */
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax		/* overlaps previous load by 3 bytes */
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2070 
	.p2align 4
/* Jump-table tail chain for lengths 45/37/29/21/13 (len % 8 == 5).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through, then the final 5 bytes go as a dword + byte.  */
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx		/* remaining 5 bytes: dword + byte */
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2099 
	.p2align 4
/* Jump-table tail chain for lengths 41/33/25/17/9/1 (len % 8 == 1).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through to a final single-byte copy.  */
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx		/* final byte */
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2127 
	.p2align 4
/* Jump-table tail chain for lengths 46/38/30/22/14/6 (len % 8 == 6).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through, then the final 6 bytes go as a dword + word.  */
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx		/* final 6 bytes: dword + word */
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2157 
	.p2align 4
/* Jump-table tail chain for lengths 42/34/26/18/10/2 (len % 8 == 2).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through to a final word copy.  */
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx		/* final 2 bytes */
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2185 
	.p2align 4
/* Jump-table tail chain for lengths 47/39/31/23/15/7 (len % 8 == 7).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through, then the final 7 bytes go as dword + word + byte.  */
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx		/* final 7 bytes: dword+word+byte */
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* eax reloaded below if needed */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2217 
	.p2align 4
/* Jump-table tail chain for lengths 43/35/27/19/11/3 (len % 8 == 3).
   %eax/%edx point one past the end of src/dst; 8-byte movq steps fall
   through, then the final 3 bytes go as word + byte.  */
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx		/* final 3 bytes: word + byte */
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove returns original dst */
# endif
#endif
	RETURN
   2247 
	.p2align 4
/* Forward-copy tail handlers, "aligned" variant: the dispatch path that
   jumps here arranges that the 16-byte movdqa accesses below land on
   16-byte-aligned addresses (NOTE(review): guaranteed by the aligned
   main-loop setup outside this chunk — confirm).  Same end-pointer /
   negative-offset / fall-through scheme as the unaligned chains.  */
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
/* Return value: mempcpy -> end of dst; memcpy -> original dst (DEST is a
   stack-slot macro from the file header); bcopy -> void.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_5bytes_align):
	/* Two overlapping 4-byte moves cover 5 bytes (bytes -5..-2 and
	   -4..-1); overlap is harmless for a copy.  */
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 45/29/13: 16-byte steps, then 8+4+1 bytes.  */
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 41/25/9/1: 16-byte steps, then 8, then 1 byte.  */
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2363 
	.p2align 4
/* More aligned tail chains; same conventions as above: %eax/%edx are
   end pointers, each case stores its last N bytes at negative offsets
   and falls through to the next-smaller case.  */
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
/* Return value selection (same in every chain): mempcpy -> %edx,
   memcpy -> saved DEST, bcopy -> void.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 46/30/14/6: 16-byte steps, then 8, then 4+2 bytes.  */
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 42/26/10/2: 16-byte steps, then 8, then 2 bytes.  */
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2465 
	.p2align 4
/* Remaining aligned tail chains (same end-pointer/fall-through scheme).
   Residuals 47/31/15/7: 16-byte steps, then 8, then 4+2+1 bytes.  */
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
/* Return value selection (same in every chain): mempcpy -> %edx,
   memcpy -> saved DEST, bcopy -> void.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 43/27/11/3: 16-byte steps, then 8, then 2+1 bytes.  */
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 44/28/12/4: 16-byte steps, then 8, then 4 bytes.  */
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END
   2597 
	/* Unwind annotation: %edi was pushed on the path that branches
	   into L(large_page).  */
	CFI_PUSH (%edi)

	.p2align 4
/* Copy path for very large buffers: stream with non-temporal stores
   (movntdq) so the destination does not pollute the cache.  On entry
   %eax = src, %edx = dst, %ecx = byte count.  */
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	/* memmove: replay the destination head saved in %xmm0 by the setup
	   code (NOTE(review): the save happens outside this chunk —
	   confirm).  DEST+4 skips the pushed %edi slot.  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	/* Bias the count down by 0x90 = 16 bytes just copied + one 128-byte
	   loop stride, so the loop can test termination with a single jae
	   after its own sub.  */
	lea	-0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
/* Main streaming loop: 128 bytes per iteration, unaligned loads,
   non-temporal aligned stores.  */
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* Undo the stride bias (cmp tests the pre-adjusted value; lea does
	   not disturb flags); if >= 64 bytes remain, copy one 64-byte
	   chunk.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Turn both pointers into end pointers for the tail handlers, which
	   store backwards from the end.  */
	add	%ecx, %edx
	add	%ecx, %eax
	/* Order the non-temporal stores before the ordinary stores done by
	   the tail handlers.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
   2667 
	.p2align 4
/* Backward-copy tail handlers: finish a residual of 0..47 bytes for the
   memmove backward path.  Here %eax = src base and %edx = dst base of the
   residual region; each case copies its HIGHEST bytes first (positive
   offsets) and falls through to the next case 8 bytes smaller.  */
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
/* Return value: memcpy/memmove return the original DEST; mempcpy returns
   DEST + LEN (DEST/LEN are stack-slot macros from the file header);
   bcopy returns nothing.  */
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 45/37/29/21/13/5: qword steps, then 4+1 bytes.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
   2752 
	.p2align 4
/* More backward tail chains; same conventions: %eax/%edx are the bases
   of the residual region, high bytes copied first, fall-through to the
   next-smaller case.  Residuals 41/33/25/17/9: qword steps, then 8
   (overlapping at offset 1) + 1 byte.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
/* Return value (same in every chain): memcpy/memmove -> DEST,
   mempcpy -> DEST + LEN, bcopy -> void.  */
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 46/38/30/22/14/6: qword steps, then 4+2 bytes.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 42/34/26/18/10/2: qword steps, then 2 bytes.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 47/39/31/23/15/7: qword steps, then 4+2+1 bytes.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residuals 43/35/27/19/11/3: qword steps, then 2+1 bytes.  */
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END
   2899 
   2900 
	/* Dispatch tables live in read-only data.  Each 32-bit entry is a
	   handler address encoded by the JMPTBL macro (defined in the file
	   header) and consumed by BRANCH_TO_JMPTBL_ENTRY; entry i handles a
	   residual of exactly i bytes.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
/* Forward-copy tails, unaligned variant (indices 0..47).  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
/* Forward-copy tails, aligned variant (indices 0..47).  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
/* Shift-merge main-loop dispatch: entry i is the variant for a source
   misalignment of i bytes within a 16-byte block.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
/* Backward-copy tails (indices 0..47).  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
   3075 
#ifdef USE_AS_MEMMOVE
	.p2align 4
/* memmove backward path (used when dst overlaps src from above; the
   branch here is made by setup code outside this chunk).  On entry
   %eax = src, %edx = dst, %ecx = count; the first two lea's convert
   %edi/%edx into end pointers, and the copy then proceeds from high
   addresses to low.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* If the destination end is not 4-byte aligned, fix that first.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rebase %eax/%edx to the START of the remaining 0..31-byte region
	   and dispatch into the bk_write tail chains.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* Unwind annotation: %edi is still pushed on the paths below.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Align the destination end to 4 bytes by copying 1 and/or 2 bytes;
   not worth doing for tiny counts.  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
	then	(EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  */
L(bk_ssse3_align):
	/* Up to three dword copies bring the destination end to 16-byte
	   alignment so the main loop can use movdqa stores.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main backward loop: 64 bytes per iteration, highest 16 bytes first;
   unaligned loads, aligned stores.  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
   3197 
   3198 END (MEMCPY)
   3199