/* SSSE3-optimized memcpy for 32-bit x86 (also builds as memmove/bcopy).  */
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
/* Name of the generated symbol; the build system may predefine MEMCPY to
   reuse this file for the memmove/bcopy variants.  */
#ifndef MEMCPY
# define MEMCPY	ssse3_memcpy5
#endif

/* Local-label wrapper: L(x) -> .Lx (assembler-local, not exported).  */
#ifndef L
# define L(label)	.L##label
#endif

/* CFI wrappers, overridable so this source also builds in environments
   (e.g. glibc sysdeps) that predefine them.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Function open/close markers: symbol type, global binding, 16-byte
   alignment, and matching CFI start/end.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Stack offsets of the three arguments.  bcopy(src, dest, n) has the
   first two arguments swapped relative to memcpy(dest, src, n).
   PARMS (defined below) accounts for the saved EBX in PIC builds.  */
#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif

/* Push/pop helpers that keep the CFI unwind information in sync with
   the actual stack adjustment.  */
#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
     94 
/* PIC builds must materialize the PC in EBX (via the i686 get_pc_thunk)
   to reach the GOT and to use position-independent jump tables, so EBX
   is saved on entry and the argument offsets shift by 4.  */
#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B
# undef __i686

# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
	jump table with relative offsets.  INDEX is a register contains the
	index into the jump table.   SCALE is the scale of INDEX. */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */		\
	SETUP_PIC_REG(bx);		\
    /* Get the address of the jump table.  */		\
	addl	$(TABLE - .), %ebx;		\
    /* Get the entry and convert the relative offset to the		\
	absolute	address.  */		\
	addl	(%ebx, INDEX, SCALE), %ebx;		\
    /* We loaded the jump table.  Go.  */		\
	jmp	*%ebx
#else

/* Non-PIC: absolute jump tables, no saved EBX.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
	absolute offsets.  INDEX is a register contains the index into the
	jump table.  SCALE is the scale of INDEX. */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
	jmp	*TABLE(, INDEX, SCALE)
#endif
    134 
	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	/* Register roles throughout the function:
	   EAX = src, EDX = dest, ECX = remaining byte count.  */
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	/* memmove: choose the copy direction based on overlap.  */
	cmp	%eax, %edx
	jb	L(copy_forward)		/* dest < src: forward copy is safe */
	je	L(fwd_write_0bytes)	/* dest == src: nothing to move */
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)

	.p2align 4
L(memmove_bwd):
	add	%ecx, %eax		/* eax = src + n (end of source) */
	cmp	%eax, %edx
	movl	SRC(%esp), %eax		/* reload src; movl does not touch flags */
	jb	L(copy_backward)	/* dest inside [src, src+n): copy backward */

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)	/* >=48 bytes: take the SSE block path */

	/* <48 bytes: dispatch on the exact count via a jump table, with
	   EAX/EDX advanced to one past the end of each buffer.  */
L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* NOTE(review): only the low 8 bits of dest/src are compared here,
	   apparently as a cheap direction heuristic for this short-copy
	   memcpy path — confirm intent against the upstream history.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
	.p2align 4
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif
    175 
	.p2align 4
L(48bytesormore):
#ifndef USE_AS_MEMMOVE
	/* memcpy: copy the first 16 bytes immediately; the aligned main
	   loop starts at the next 16-byte destination boundary, which
	   lies within these bytes.  */
	movlpd	(%eax), %xmm0
	movlpd	8(%eax), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
#else
	/* memmove: only load the head here; it is stored at each shl_N
	   entry, after the (possibly overlapping) source has been read.  */
	movdqu	(%eax), %xmm0
#endif
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	add	$16, %edx		/* edx = dest rounded up to 16 bytes */
	sub	%edx, %edi		/* edi = dest - aligned_dest, in [-16, -1] */
	add	%edi, %ecx		/* discount head bytes handled above */
	sub	%edi, %eax		/* advance src by the same amount */

#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi
	jae	L(large_page)		/* beyond half the shared cache: special path */
	and	$0xf, %edi		/* edi = src offset within a 16-byte block */
	jz	L(shl_0)		/* src and dest now mutually 16-byte aligned */
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
    211 
	.p2align 4
L(shl_0):
	/* src and dest are mutually 16-byte aligned: plain movdqa copy.  */
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi	/* +4: %edi is currently on the stack */
	movdqu	%xmm0, (%edi)		/* store the saved 16-byte head */
#endif
	xor	%edi, %edi		/* edi = running copy offset */
	cmp	$127, %ecx
	ja	L(shl_0_gobble)		/* >127 bytes: take the big-block loop */
	lea	-32(%ecx), %ecx

	/* <=127 bytes: up to four straight-line 32-byte copies.  The
	   sub $32 sets CF when fewer than 32 bytes remained, ending the
	   sequence via jb.  */
	.p2align 4
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi

L(shl_0_end):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes (0..31) */
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	CFI_PUSH (%edi)			/* resync unwind info: %edi is saved in code below */
    265 
	.p2align 4
L(shl_0_gobble):
	/* Mutually aligned big copy: use the simple 128-byte loop while
	   the count fits in half the L1 data cache, otherwise the
	   prefetching loop below.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	POP	(%edi)			/* popl preserves flags from the cmp above */
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)

	/* 128 bytes per iteration using all eight xmm registers.  */
	.p2align 4
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	cmp	$-0x40, %ecx		/* at least 64 bytes left? (ecx = count-128) */
	lea	0x80(%ecx), %ecx	/* restore true remaining count; lea keeps flags */
	jl	L(shl_0_cache_less_64bytes)

	/* Copy one 64-byte chunk.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_cache_less_16bytes):
	/* <16 bytes left: finish via the tail jump table.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    346 
	/* Aligned copy larger than half the L1 data cache: same 128-byte
	   loop shape as above, with software prefetch ahead of both
	   streams.  */
	.p2align 4
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	cmp	$-0x40, %ecx		/* at least 64 bytes left? (ecx = count-128) */
	lea	0x80(%ecx), %ecx	/* restore true remaining count */
	jl	L(shl_0_mem_less_64bytes)

	/* Copy one 64-byte chunk.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_mem_less_16bytes):
	/* <16 bytes left: finish via the tail jump table.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
    417 
	.p2align 4
L(shl_1):
	/* Source is 1 byte past a 16-byte boundary (dest is aligned).
	   Read aligned 16-byte blocks and stitch adjacent blocks with
	   palignr $1 so every load and store stays aligned.  xmm1 carries
	   the previous aligned source block across iterations.  */
#ifndef USE_AS_MEMMOVE
	movaps	-1(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-1(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_1_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl1LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	movaps	47(%eax), %xmm4
	movaps	63(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$1, %xmm4, %xmm5
	palignr	$1, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$1, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl1LoopStart)

L(Shl1LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)			/* resync unwind info for the path below */

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_1_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-1(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_1_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4		/* carry block in xmm4 for 2nd half */
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_1_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1		/* carry block back in xmm1 */
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_1_no_prefetch_loop)

L(sh_1_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	1(%edi, %eax), %eax	/* undo the -1 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
    521 
	.p2align 4
L(shl_2):
	/* Source is 2 bytes past a 16-byte boundary: same scheme as the
	   other shl_N paths, realigning with palignr $2.  xmm1 carries
	   the previous aligned source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-2(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-2(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_2_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl2LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	movaps	46(%eax), %xmm4
	movaps	62(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$2, %xmm4, %xmm5
	palignr	$2, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$2, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl2LoopStart)

L(Shl2LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_2_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-2(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_2_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_2_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_2_no_prefetch_loop)

L(sh_2_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	2(%edi, %eax), %eax	/* undo the -2 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
    625 
	.p2align 4
L(shl_3):
	/* Source is 3 bytes past a 16-byte boundary: aligned loads
	   stitched with palignr $3.  xmm1 carries the previous aligned
	   source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-3(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-3(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_3_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl3LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	movaps	45(%eax), %xmm4
	movaps	61(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$3, %xmm4, %xmm5
	palignr	$3, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$3, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl3LoopStart)

L(Shl3LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_3_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-3(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_3_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_3_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_3_no_prefetch_loop)

L(sh_3_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	3(%edi, %eax), %eax	/* undo the -3 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
    731 
	.p2align 4
L(shl_4):
	/* Source is 4 bytes past a 16-byte boundary: aligned loads
	   stitched with palignr $4.  xmm1 carries the previous aligned
	   source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-4(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-4(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_4_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl4LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	movaps	44(%eax), %xmm4
	movaps	60(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$4, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl4LoopStart)

L(Shl4LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_4_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-4(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_4_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_4_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_4_no_prefetch_loop)

L(sh_4_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	4(%edi, %eax), %eax	/* undo the -4 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
    837 
	.p2align 4
L(shl_5):
	/* Source is 5 bytes past a 16-byte boundary: aligned loads
	   stitched with palignr $5.  xmm1 carries the previous aligned
	   source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-5(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-5(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_5_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl5LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	movaps	43(%eax), %xmm4
	movaps	59(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$5, %xmm4, %xmm5
	palignr	$5, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$5, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl5LoopStart)

L(Shl5LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_5_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-5(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_5_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_5_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_5_no_prefetch_loop)

L(sh_5_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	5(%edi, %eax), %eax	/* undo the -5 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
    943 
	.p2align 4
L(shl_6):
	/* Source is 6 bytes past a 16-byte boundary: aligned loads
	   stitched with palignr $6.  xmm1 carries the previous aligned
	   source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-6(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-6(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_6_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl6LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	movaps	42(%eax), %xmm4
	movaps	58(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$6, %xmm4, %xmm5
	palignr	$6, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$6, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl6LoopStart)

L(Shl6LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_6_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-6(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_6_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_6_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_6_no_prefetch_loop)

L(sh_6_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	6(%edi, %eax), %eax	/* undo the -6 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1049 
	.p2align 4
L(shl_7):
	/* Source is 7 bytes past a 16-byte boundary: aligned loads
	   stitched with palignr $7.  xmm1 carries the previous aligned
	   source block.  */
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi is on the stack */
	movaps	-7(%eax), %xmm1		/* read src before storing the head */
	movdqu	%xmm0, (%edi)		/* memmove: store the saved head */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_7_no_prefetch)		/* small copy: skip software prefetch */

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep last block for the next iteration */
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)

L(Shl7LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* <=32 bytes left: common tail */

	/* One final 32-byte chunk, then the tail jump table.  */
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: 32 bytes per (2x unrolled) iteration.  */
	.p2align 4
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-7(%eax), %eax		/* bias src so aligned loads line up */
	xor	%edi, %edi

	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax	/* undo the -7 source bias */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1153 
	.p2align 4
/* Forward copy for src 8 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 8 (aligned chunks at 8 = 16 - 8).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_8_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)

/* NOTE(review): label naming differs from the sibling paths, which use
   the Shl<N>LoopLeave pattern.  */
L(LoopLeave8):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-8(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	8(%edi, %eax), %eax	/* undo the -8 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1257 
	.p2align 4
/* Forward copy for src 9 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 9 (aligned chunks at 7 = 16 - 9).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_9_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)

L(Shl9LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-9(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	9(%edi, %eax), %eax	/* undo the -9 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1362 
	.p2align 4
/* Forward copy for src 10 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 10 (aligned chunks at 6 = 16 - 10).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-10(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_10_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)

L(Shl10LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_10_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-10(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	10(%edi, %eax), %eax	/* undo the -10 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1467 
	.p2align 4
/* Forward copy for src 11 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 11 (aligned chunks at 5 = 16 - 11).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-11(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_11_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)

L(Shl11LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-11(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	11(%edi, %eax), %eax	/* undo the -11 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1572 
	.p2align 4
/* Forward copy for src 12 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 12 (aligned chunks at 4 = 16 - 12).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-12(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_12_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)

L(Shl12LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-12(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	12(%edi, %eax), %eax	/* undo the -12 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1677 
	.p2align 4
/* Forward copy for src 13 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 13 (aligned chunks at 3 = 16 - 13).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-13(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_13_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)

L(Shl13LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-13(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	13(%edi, %eax), %eax	/* undo the -13 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1782 
	.p2align 4
/* Forward copy for src 14 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 14 (aligned chunks at 2 = 16 - 14).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-14(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_14_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)

L(Shl14LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-14(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	14(%edi, %eax), %eax	/* undo the -14 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1887 
	.p2align 4
/* Forward copy for src 15 bytes past a 16-byte boundary; same structure
   as L(shl_7) but with palignr shift 15 (aligned chunks at 1 = 16 - 15).
   xmm1 carries the previous aligned 16-byte src chunk.  */
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1
#else
	/* memmove: flush the saved dst head (xmm0) first.  */
	movl	DEST+4(%esp), %edi
	movaps	-15(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetching loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_15_no_prefetch)

	lea	-64(%ecx), %ecx	/* bias count for the 64-byte loop test */

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 ahead.  */
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7	/* save last chunk for next iteration */
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1	/* carry chunk into next iteration */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)

L(Shl15LoopLeave):
	add	$32, %ecx	/* %ecx = remaining - 32 */
	jle	L(shl_end_0)	/* at most 32 bytes left */

	/* One more 32-byte chunk, then jump-table dispatch of the tail.  */
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx	/* point one past the end of the copy */
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; xmm4 and
   xmm1 alternate as the carried chunk.  %edi = bytes copied so far.  */
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx	/* bias count for the 32-byte loop test */
	lea	-15(%eax), %eax	/* round src down to 16-byte alignment */
	xor	%edi, %edi

	.p2align 4
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4	/* carried chunk for the second half */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1	/* carried chunk for the first half */
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	add	%ecx, %edi	/* %edi = total bytes in this copy */
	add	%edi, %edx	/* %edx/%eax: one past the end of the copy */
	lea	15(%edi, %eax), %eax	/* undo the -15 realignment */
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1992 
	.p2align 4
/* Common tail for the Shl<N>LoopLeave paths: %ecx arrives biased by
   -32, so restore the true residual count, advance %edx/%eax one past
   the end of the remaining bytes (the table entries copy the tail
   using negative offsets), and dispatch through the forward table.  */
L(shl_end_0):
	lea	32(%ecx), %ecx	/* %ecx = residual byte count */
	lea	(%edx, %ecx), %edx
	lea	(%eax, %ecx), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   2000 
	.p2align 4
/* Jump-table tail target: fall-through ladder of 8-byte copies.
   Entered with %eax (src) and %edx (dst) pointing one past the end of
   the copy; each label copies 8 bytes at its negative offset and falls
   into the next, so this ladder handles sizes 44/36/28/20/12/4
   (ending with a single 4-byte dword).  */
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2028 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 40/32/24/16/8/0.
   Entered with %eax/%edx pointing one past the end of the copy;
   each label copies 8 bytes at its negative offset.  */
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2054 
	.p2align 4
/* Copy exactly 5 bytes using two overlapping dword moves
   (bytes [-5,-1) and [-4,0) of the end pointers %eax/%edx).  */
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2069 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 45/37/29/21/13.
   Entered with %eax/%edx pointing one past the end of the copy;
   the final 13 bytes are done as 8 + 4 + 1.  */
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2098 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 41/33/25/17/9/1.
   Entered with %eax/%edx pointing one past the end of the copy;
   ends with a single trailing byte.  */
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2126 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 46/38/30/22/14/6.
   Entered with %eax/%edx pointing one past the end of the copy;
   the final 6 bytes are done as 4 + 2.  */
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2156 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 42/34/26/18/10/2.
   Entered with %eax/%edx pointing one past the end of the copy;
   ends with a single trailing word.  */
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2184 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 47/39/31/23/15/7.
   Entered with %eax/%edx pointing one past the end of the copy;
   the final 7 bytes are done as 4 + 2 + 1 (%eax is used as a byte
   scratch after its last load).  */
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2216 
	.p2align 4
/* Fall-through ladder of 8-byte copies for sizes 43/35/27/19/11/3.
   Entered with %eax/%edx pointing one past the end of the copy;
   the final 3 bytes are done as 2 + 1 (%eax is used as a byte
   scratch after its last load).  */
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
/* bcopy returns void; otherwise set the return value.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax	/* mempcpy returns the end of dst */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return dst */
# endif
#endif
	RETURN
   2246 
	/* Forward-copy tail cases for the aligned path (reached through
	   L(table_48bytes_fwd_align)).  Same register convention as the
	   unaligned tails — EAX = src + len, EDX = dst + len, negative
	   offsets — but here 16-byte chunks use movdqa, i.e. both the
	   load and store addresses are assumed 16-byte aligned at those
	   offsets on this path.  Clusters are grouped by length residue
	   so each entry falls through 16 bytes at a time.  */
	.p2align 4
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
#ifndef USE_AS_BCOPY
	/* Return value: mempcpy returns dst+len (already in EDX);
	   memcpy/memmove reload the original destination argument.  */
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* 5 bytes: two overlapping 4-byte moves cover bytes [-5,-1].  */
	.p2align 4
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 45/29/13: 16-byte fall-through, then 8 + 4 + 1.  */
	.p2align 4
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 37/21: 16-byte fall-through, then 4 + 1.  Note the
	   last 5 bytes overlap the preceding movdqa store.  */
	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 41/25/9/1: 16-byte fall-through, then 8 + 1.  */
	.p2align 4
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 33/17: 16-byte fall-through, then the last byte.  */
	.p2align 4
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 46/30/14/6: 16-byte fall-through, then 4 + 2.  */
	.p2align 4
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 38/22: 16-byte fall-through, then 4 + 2.  */
	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 42/26/10/2: 16-byte fall-through, then 8, then 2.  */
	.p2align 4
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 34/18: 16-byte fall-through, then the last 2 bytes.  */
	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 47/31/15/7: 16-byte fall-through, 8, then 4 + 2 + 1.  */
	.p2align 4
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 39/23: 16-byte fall-through, then 4 + 2 + 1.  */
	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 43/27/11/3: 16-byte fall-through, 8, then 2 + 1.  */
	.p2align 4
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 35/19: 16-byte fall-through, then 2 + 1.  */
	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 44/28/12/4: 16-byte fall-through, 8, then 4.  */
	.p2align 4
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	/* Lengths 36/20: 16-byte fall-through, then the last 4 bytes.
	   RETURN_END closes the final tail of this group.  */
	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END
   2596 
	/* Restore the CFI note that EDI is saved on the stack for the
	   code below (the actual push happened on the path that jumps
	   here, above this chunk).  */
	CFI_PUSH (%edi)

	/* Large-copy path: sizes big enough that the data will not stay
	   in cache, so stores use non-temporal movntdq to bypass it.
	   On entry EAX = src, EDX = dst (16-byte aligned for the
	   movntdq stores), ECX = byte count.  */
	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	/* For memmove, flush the first 16 destination bytes held in
	   xmm0 (loaded earlier, outside this view — presumably the
	   original head of the source saved before dst was advanced;
	   TODO confirm against the code above).  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	/* Bias the count: 0x90 = the 16 bytes just copied + 0x80, so
	   the loop below can test with a single sub/jae per pass.  */
	lea	-0x90(%ecx), %ecx
	POP (%edi)

	/* Main loop: stream 128 bytes per iteration through xmm0-xmm7.
	   Loop continues while the biased count stays non-negative,
	   i.e. while at least 128 more bytes remain.  */
	.p2align 4
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* Un-bias: ECX += 0x80 restores the true remaining count; the
	   cmp (done before the lea so its flags survive) checks whether
	   at least 64 bytes remain.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	/* Copy one 64-byte chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	/* Copy one 32-byte chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* At most 31 bytes left.  Point EAX/EDX at the end of the
	   buffers (the tail entries expect end-relative addressing),
	   fence the non-temporal stores, and dispatch on the count.  */
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
   2666 
	/* Backward-copy tail cases.  Unlike the forward tails, these
	   are entered with EAX = src base and EDX = dst base, so all
	   offsets are positive.  Each entry copies 8 bytes and falls
	   through to the next smaller case.  The return value is the
	   original DEST (mempcpy adds LEN); bcopy returns nothing.  */
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 40/32/24/16/8.  */
	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 45/37/29/21/13/5/1: final 5 bytes as 4 + 1.  */
	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 41/33/25/17/9: final 9 bytes as 8 + 1.  */
	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 46/38/30/22/14/6: final 6 bytes as 4 + 2.  */
	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 42/34/26/18/10/2: final 2 bytes as one word.  */
	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 47/39/31/23/15/7: final 7 bytes as 4 + 2 + 1.  */
	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	/* Lengths 43/35/27/19/11/3: final 3 bytes as 2 + 1.
	   RETURN_END closes the last tail of this group.  */
	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END
   2898 
   2899 
	/* Dispatch tables, placed in read-only data.  Each is an array
	   of 48 (or 16) 32-bit entries indexed by the remaining byte
	   count (or shift amount) and consumed by
	   BRANCH_TO_JMPTBL_ENTRY.  JMPTBL is defined outside this view;
	   it presumably emits either an absolute address or a
	   table-relative offset depending on PIC mode — confirm against
	   the macro definition at the top of the file.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
	/* Forward tails, unaligned path: entry N handles N bytes.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
	/* Forward tails, 16-byte-aligned path.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
	/* Shift dispatch: entry N is the copy loop for a source that is
	   N bytes past 16-byte alignment (the L(shl_N) bodies are above
	   this chunk).  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
	/* Backward tails: entry N handles N remaining bytes.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
   3074 
#ifdef USE_AS_MEMMOVE
	/* Backward copy for overlapping memmove (dst > src).  On entry
	   EAX = src, EDX = dst, ECX = len.  EDI is saved and used to
	   hold the source cursor; both cursors are moved to the END of
	   their buffers and the copy proceeds downward.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx	/* edx = dst + len (one past end) */
	lea	(%ecx,%edi,1),%edi	/* edi = src + len (one past end) */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	/* Destination end is 4-byte aligned.  */
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time, backwards, in 8-byte pieces.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Fewer than 32 bytes remain.  Rebase EAX/EDX to the START of
	   the remaining region so the L(bk_write_*) tails (which use
	   positive offsets) can finish, and restore EDI first.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* Align the destination END to 4 bytes by copying 1 and/or 2
	   trailing bytes first.  */
	.p2align 4
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We only get here when (EDX & 3) != 0, so if the low bit is
	   clear then (EDX & 2) must be set and one 2-byte copy
	   suffices.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy up to three
   4-byte chunks until the destination end is 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	/* Main backward loop: 64 bytes per iteration; loads are
	   unaligned (movdqu), stores aligned (movdqa) since EDX end
	   was 16-byte aligned above.  */
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)
   3198