Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
/*
 * Build-time configuration and compatibility shims.
 *
 * cache.h supplies the cache-size tuning macros referenced later in
 * this file (SHARED_CACHE_SIZE_HALF / DATA_CACHE_SIZE_HALF) when they
 * are defined at build time.
 */
      31 #include "cache.h"
      32 #undef __i686
      33 
/* Default exported symbol name.  A wrapper file may predefine MEMCPY
   (together with USE_AS_MEMMOVE / USE_AS_BCOPY) to reuse this
   implementation for memmove or bcopy. */
      34 #ifndef MEMCPY
      35 # define MEMCPY	memcpy
      36 #endif
      37 
/* L(x) expands to the assembler-local (non-exported) label .Lx. */
      38 #ifndef L
      39 # define L(label)	.L##label
      40 #endif
      41 
/* Thin wrappers over the DWARF call-frame-information directives, so
   the same spellings work whether or not the build environment (e.g.
   glibc's sysdep.h) has already defined them. */
      42 #ifndef cfi_startproc
      43 # define cfi_startproc	.cfi_startproc
      44 #endif
      45 
      46 #ifndef cfi_endproc
      47 # define cfi_endproc	.cfi_endproc
      48 #endif
      49 
      50 #ifndef cfi_rel_offset
      51 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
      52 #endif
      53 
      54 #ifndef cfi_restore
      55 # define cfi_restore(reg)	.cfi_restore reg
      56 #endif
      57 
      58 #ifndef cfi_adjust_cfa_offset
      59 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
      60 #endif
     61 
/*
 * ENTRY/END: declare a global function symbol with unwind info.
 * ENTRY emits .type/.globl for NAME, aligns the entry point to a
 * 16-byte boundary (.p2align 4), places the label, and opens the CFI
 * region; END closes the CFI region and records the symbol size.
 */
      62 #ifndef ENTRY
      63 # define ENTRY(name)		\
      64 	.type name,  @function;		\
      65 	.globl name;		\
      66 	.p2align 4;		\
      67 name:		\
      68 	cfi_startproc
      69 #endif
      70 
      71 #ifndef END
      72 # define END(name)		\
      73 	cfi_endproc;		\
      74 	.size name, .-name
      75 #endif
     76 
/*
 * Stack offsets of the incoming arguments relative to %esp after
 * ENTRANCE (PARMS is defined below and accounts for the return
 * address and any saved register).  bcopy(src, dst, n) takes its
 * first two arguments in the opposite order from memcpy(dst, src, n),
 * hence the two layouts.
 */
      77 #ifdef USE_AS_BCOPY
      78 # define SRC		PARMS
      79 # define DEST		SRC+4
      80 # define LEN		DEST+4
      81 #else
      82 # define DEST		PARMS
      83 # define SRC		DEST+4
      84 # define LEN		SRC+4
      85 #endif
      86 
/* CFI bookkeeping for a 4-byte push/pop: adjust the CFA offset and
   record (or forget) where REG was saved, so unwinding stays correct
   at any point inside the function. */
      87 #define CFI_PUSH(REG)		\
      88   cfi_adjust_cfa_offset (4);		\
      89   cfi_rel_offset (REG, 0)
      90 
      91 #define CFI_POP(REG)		\
      92   cfi_adjust_cfa_offset (-2);		\
      93   cfi_restore (REG)
      94 
/* PUSH/POP a register while keeping the CFI state in sync. */
      95 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
      96 #define POP(REG)	popl REG; CFI_POP (REG)
     97 
/*
 * PIC vs non-PIC configuration.
 *
 * Position-independent builds must avoid absolute addresses, so:
 *  - %ebx is reserved as the PC/GOT register and is saved/restored
 *    around the function, which is why PARMS is 8 (saved %ebx plus
 *    the return address) instead of 4;
 *  - jump tables store entry-relative offsets (JMPTBL(I, B) = I - B),
 *    and BRANCH_TO_JMPTBL_ENTRY first materializes the PC in %ebx via
 *    the __i686.get_pc_thunk helper before indexing the table.
 * Non-PIC builds store absolute entries and branch through a single
 * indirect jmp.
 */
      98 #if (defined SHARED || defined __PIC__)
      99 # define PARMS		8		/* Preserve EBX.  */
     100 # define ENTRANCE	PUSH (%ebx);
     101 # define RETURN_END	POP (%ebx); ret
     102 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
     103 # define JMPTBL(I, B)	I - B
     104 # undef __i686
     105 
     106 # define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x
     107 
     108 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
     109 	jump table with relative offsets.  INDEX is a register containing the
     110 	index into the jump table.   SCALE is the scale of INDEX. */
     111 
     112 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
     113     /* We first load PC into EBX.  */		\
     114 	SETUP_PIC_REG(bx);		\
     115     /* Get the address of the jump table.  */		\
     116 	addl	$(TABLE - .), %ebx;		\
     117     /* Get the entry and convert the relative offset to the		\
     118 	absolute	address.  */		\
     119 	addl	(%ebx, INDEX, SCALE), %ebx;		\
     120     /* We loaded the jump table.  Go.  */		\
     121 	jmp	*%ebx
     122 #else
     123 
     124 # define PARMS		4
     125 # define ENTRANCE
     126 # define RETURN_END	ret
     127 # define RETURN		RETURN_END
     128 # define JMPTBL(I, B)	I
     129 
     130 /* Branch to an entry in a jump table.  TABLE is a jump table with
     131 	absolute offsets.  INDEX is a register containing the index into the
     132 	jump table.  SCALE is the scale of INDEX. */
     133 
     134 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
     135 	jmp	*TABLE(, INDEX, SCALE)
     136 #endif
    137 
    138 	.section .text.ssse3,"ax",@progbits
    139 ENTRY (MEMCPY)
    140 	ENTRANCE
    141 	movl	LEN(%esp), %ecx
    142 	movl	SRC(%esp), %eax
    143 	movl	DEST(%esp), %edx
    144 
    145 #ifdef USE_AS_MEMMOVE
    146 	cmp	%eax, %edx
    147 	jb	L(copy_forward)
    148 	je	L(fwd_write_0bytes)
    149 	cmp	$32, %ecx
    150 	jae	L(memmove_bwd)
    151 	jmp	L(bk_write_less32bytes_2)
    152 
    153 	.p2align 4
    154 L(memmove_bwd):
    155 	add	%ecx, %eax
    156 	cmp	%eax, %edx
    157 	movl	SRC(%esp), %eax
    158 	jb	L(copy_backward)
    159 
    160 L(copy_forward):
    161 #endif
    162 	cmp	$48, %ecx
    163 	jae	L(48bytesormore)
    164 
    165 L(fwd_write_less32bytes):
    166 #ifndef USE_AS_MEMMOVE
    167 	cmp	%dl, %al
    168 	jb	L(bk_write)
    169 #endif
    170 	add	%ecx, %edx
    171 	add	%ecx, %eax
    172 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    173 #ifndef USE_AS_MEMMOVE
    174 	.p2align 4
    175 L(bk_write):
    176 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
    177 #endif
    178 
    179 	.p2align 4
    180 L(48bytesormore):
    181 #ifndef USE_AS_MEMMOVE
    182 	movlpd	(%eax), %xmm0
    183 	movlpd	8(%eax), %xmm1
    184 	movlpd	%xmm0, (%edx)
    185 	movlpd	%xmm1, 8(%edx)
    186 #else
    187 	movdqu	(%eax), %xmm0
    188 #endif
    189 	PUSH (%edi)
    190 	movl	%edx, %edi
    191 	and	$-16, %edx
    192 	add	$16, %edx
    193 	sub	%edx, %edi
    194 	add	%edi, %ecx
    195 	sub	%edi, %eax
    196 
    197 #ifdef SHARED_CACHE_SIZE_HALF
    198 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
    199 #else
    200 # if (defined SHARED || defined __PIC__)
    201 	SETUP_PIC_REG(bx)
    202 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    203 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
    204 # else
    205 	cmp	__x86_shared_cache_size_half, %ecx
    206 # endif
    207 #endif
    208 
    209 	mov	%eax, %edi
    210 	jae	L(large_page)
    211 	and	$0xf, %edi
    212 	jz	L(shl_0)
    213 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
    214 
    215 	.p2align 4
    216 L(shl_0):
    217 #ifdef USE_AS_MEMMOVE
    218 	movl	DEST+4(%esp), %edi
    219 	movdqu	%xmm0, (%edi)
    220 #endif
    221 	xor	%edi, %edi
    222 	cmp	$127, %ecx
    223 	ja	L(shl_0_gobble)
    224 	lea	-32(%ecx), %ecx
    225 
    226 	.p2align 4
    227 L(shl_0_loop):
    228 	movdqa	(%eax, %edi), %xmm0
    229 	movdqa	16(%eax, %edi), %xmm1
    230 	sub	$32, %ecx
    231 	movdqa	%xmm0, (%edx, %edi)
    232 	movdqa	%xmm1, 16(%edx, %edi)
    233 	lea	32(%edi), %edi
    234 	jb	L(shl_0_end)
    235 
    236 	movdqa	(%eax, %edi), %xmm0
    237 	movdqa	16(%eax, %edi), %xmm1
    238 	sub	$32, %ecx
    239 	movdqa	%xmm0, (%edx, %edi)
    240 	movdqa	%xmm1, 16(%edx, %edi)
    241 	lea	32(%edi), %edi
    242 	jb	L(shl_0_end)
    243 
    244 	movdqa	(%eax, %edi), %xmm0
    245 	movdqa	16(%eax, %edi), %xmm1
    246 	sub	$32, %ecx
    247 	movdqa	%xmm0, (%edx, %edi)
    248 	movdqa	%xmm1, 16(%edx, %edi)
    249 	lea	32(%edi), %edi
    250 	jb	L(shl_0_end)
    251 
    252 	movdqa	(%eax, %edi), %xmm0
    253 	movdqa	16(%eax, %edi), %xmm1
    254 	sub	$32, %ecx
    255 	movdqa	%xmm0, (%edx, %edi)
    256 	movdqa	%xmm1, 16(%edx, %edi)
    257 	lea	32(%edi), %edi
    258 
    259 L(shl_0_end):
    260 	lea	32(%ecx), %ecx
    261 	add	%ecx, %edi
    262 	add	%edi, %edx
    263 	add	%edi, %eax
    264 	POP (%edi)
    265 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
    266 
    267 	CFI_PUSH (%edi)
    268 
    269 	.p2align 4
    270 L(shl_0_gobble):
    271 #ifdef DATA_CACHE_SIZE_HALF
    272 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    273 #else
    274 # if (defined SHARED || defined __PIC__)
    275 	SETUP_PIC_REG(bx)
    276 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    277 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    278 # else
    279 	cmp	__x86_data_cache_size_half, %ecx
    280 # endif
    281 #endif
    282 	POP	(%edi)
    283 	lea	-128(%ecx), %ecx
    284 	jae	L(shl_0_gobble_mem_loop)
    285 
    286 	.p2align 4
    287 L(shl_0_gobble_cache_loop):
    288 	movdqa	(%eax), %xmm0
    289 	movdqa	0x10(%eax), %xmm1
    290 	movdqa	0x20(%eax), %xmm2
    291 	movdqa	0x30(%eax), %xmm3
    292 	movdqa	0x40(%eax), %xmm4
    293 	movdqa	0x50(%eax), %xmm5
    294 	movdqa	0x60(%eax), %xmm6
    295 	movdqa	0x70(%eax), %xmm7
    296 	lea	0x80(%eax), %eax
    297 	sub	$128, %ecx
    298 	movdqa	%xmm0, (%edx)
    299 	movdqa	%xmm1, 0x10(%edx)
    300 	movdqa	%xmm2, 0x20(%edx)
    301 	movdqa	%xmm3, 0x30(%edx)
    302 	movdqa	%xmm4, 0x40(%edx)
    303 	movdqa	%xmm5, 0x50(%edx)
    304 	movdqa	%xmm6, 0x60(%edx)
    305 	movdqa	%xmm7, 0x70(%edx)
    306 	lea	0x80(%edx), %edx
    307 
    308 	jae	L(shl_0_gobble_cache_loop)
    309 	cmp	$-0x40, %ecx
    310 	lea	0x80(%ecx), %ecx
    311 	jl	L(shl_0_cache_less_64bytes)
    312 
    313 	movdqa	(%eax), %xmm0
    314 	sub	$0x40, %ecx
    315 	movdqa	0x10(%eax), %xmm1
    316 	movdqa	%xmm0, (%edx)
    317 	movdqa	%xmm1, 0x10(%edx)
    318 	movdqa	0x20(%eax), %xmm0
    319 	movdqa	0x30(%eax), %xmm1
    320 	add	$0x40, %eax
    321 	movdqa	%xmm0, 0x20(%edx)
    322 	movdqa	%xmm1, 0x30(%edx)
    323 	add	$0x40, %edx
    324 
    325 L(shl_0_cache_less_64bytes):
    326 	cmp	$0x20, %ecx
    327 	jb	L(shl_0_cache_less_32bytes)
    328 	movdqa	(%eax), %xmm0
    329 	sub	$0x20, %ecx
    330 	movdqa	0x10(%eax), %xmm1
    331 	add	$0x20, %eax
    332 	movdqa	%xmm0, (%edx)
    333 	movdqa	%xmm1, 0x10(%edx)
    334 	add	$0x20, %edx
    335 
    336 L(shl_0_cache_less_32bytes):
    337 	cmp	$0x10, %ecx
    338 	jb	L(shl_0_cache_less_16bytes)
    339 	sub	$0x10, %ecx
    340 	movdqa	(%eax), %xmm0
    341 	add	$0x10, %eax
    342 	movdqa	%xmm0, (%edx)
    343 	add	$0x10, %edx
    344 
    345 L(shl_0_cache_less_16bytes):
    346 	add	%ecx, %edx
    347 	add	%ecx, %eax
    348 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    349 
    350 	.p2align 4
    351 L(shl_0_gobble_mem_loop):
    352 	prefetcht0 0x1c0(%eax)
    353 	prefetcht0 0x280(%eax)
    354 	prefetcht0 0x1c0(%edx)
    355 
    356 	movdqa	(%eax), %xmm0
    357 	movdqa	0x10(%eax), %xmm1
    358 	movdqa	0x20(%eax), %xmm2
    359 	movdqa	0x30(%eax), %xmm3
    360 	movdqa	0x40(%eax), %xmm4
    361 	movdqa	0x50(%eax), %xmm5
    362 	movdqa	0x60(%eax), %xmm6
    363 	movdqa	0x70(%eax), %xmm7
    364 	lea	0x80(%eax), %eax
    365 	sub	$0x80, %ecx
    366 	movdqa	%xmm0, (%edx)
    367 	movdqa	%xmm1, 0x10(%edx)
    368 	movdqa	%xmm2, 0x20(%edx)
    369 	movdqa	%xmm3, 0x30(%edx)
    370 	movdqa	%xmm4, 0x40(%edx)
    371 	movdqa	%xmm5, 0x50(%edx)
    372 	movdqa	%xmm6, 0x60(%edx)
    373 	movdqa	%xmm7, 0x70(%edx)
    374 	lea	0x80(%edx), %edx
    375 
    376 	jae	L(shl_0_gobble_mem_loop)
    377 	cmp	$-0x40, %ecx
    378 	lea	0x80(%ecx), %ecx
    379 	jl	L(shl_0_mem_less_64bytes)
    380 
    381 	movdqa	(%eax), %xmm0
    382 	sub	$0x40, %ecx
    383 	movdqa	0x10(%eax), %xmm1
    384 
    385 	movdqa	%xmm0, (%edx)
    386 	movdqa	%xmm1, 0x10(%edx)
    387 
    388 	movdqa	0x20(%eax), %xmm0
    389 	movdqa	0x30(%eax), %xmm1
    390 	add	$0x40, %eax
    391 
    392 	movdqa	%xmm0, 0x20(%edx)
    393 	movdqa	%xmm1, 0x30(%edx)
    394 	add	$0x40, %edx
    395 
    396 L(shl_0_mem_less_64bytes):
    397 	cmp	$0x20, %ecx
    398 	jb	L(shl_0_mem_less_32bytes)
    399 	movdqa	(%eax), %xmm0
    400 	sub	$0x20, %ecx
    401 	movdqa	0x10(%eax), %xmm1
    402 	add	$0x20, %eax
    403 	movdqa	%xmm0, (%edx)
    404 	movdqa	%xmm1, 0x10(%edx)
    405 	add	$0x20, %edx
    406 
    407 L(shl_0_mem_less_32bytes):
    408 	cmp	$0x10, %ecx
    409 	jb	L(shl_0_mem_less_16bytes)
    410 	sub	$0x10, %ecx
    411 	movdqa	(%eax), %xmm0
    412 	add	$0x10, %eax
    413 	movdqa	%xmm0, (%edx)
    414 	add	$0x10, %edx
    415 
    416 L(shl_0_mem_less_16bytes):
    417 	add	%ecx, %edx
    418 	add	%ecx, %eax
    419 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
    420 
    421 	.p2align 4
    422 L(shl_1):
    423 #ifndef USE_AS_MEMMOVE
    424 	movaps	-1(%eax), %xmm1
    425 #else
    426 	movl	DEST+4(%esp), %edi
    427 	movaps	-1(%eax), %xmm1
    428 	movdqu	%xmm0, (%edi)
    429 #endif
    430 #ifdef DATA_CACHE_SIZE_HALF
    431 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    432 #else
    433 # if (defined SHARED || defined __PIC__)
    434 	SETUP_PIC_REG(bx)
    435 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    436 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    437 # else
    438 	cmp	__x86_data_cache_size_half, %ecx
    439 # endif
    440 #endif
    441 	jb L(sh_1_no_prefetch)
    442 
    443 	lea	-64(%ecx), %ecx
    444 
    445 	.p2align 4
    446 L(Shl1LoopStart):
    447 	prefetcht0 0x1c0(%eax)
    448 	prefetcht0 0x1c0(%edx)
    449 	movaps	15(%eax), %xmm2
    450 	movaps	31(%eax), %xmm3
    451 	movaps	47(%eax), %xmm4
    452 	movaps	63(%eax), %xmm5
    453 	movaps	%xmm5, %xmm7
    454 	palignr	$1, %xmm4, %xmm5
    455 	palignr	$1, %xmm3, %xmm4
    456 	movaps	%xmm5, 48(%edx)
    457 	palignr	$1, %xmm2, %xmm3
    458 	lea	64(%eax), %eax
    459 	palignr	$1, %xmm1, %xmm2
    460 	movaps	%xmm4, 32(%edx)
    461 	movaps	%xmm3, 16(%edx)
    462 	movaps	%xmm7, %xmm1
    463 	movaps	%xmm2, (%edx)
    464 	lea	64(%edx), %edx
    465 	sub	$64, %ecx
    466 	ja	L(Shl1LoopStart)
    467 
    468 L(Shl1LoopLeave):
    469 	add	$32, %ecx
    470 	jle	L(shl_end_0)
    471 
    472 	movaps	15(%eax), %xmm2
    473 	movaps	31(%eax), %xmm3
    474 	palignr	$1, %xmm2, %xmm3
    475 	palignr	$1, %xmm1, %xmm2
    476 	movaps	%xmm2, (%edx)
    477 	movaps	%xmm3, 16(%edx)
    478 	lea	32(%edx, %ecx), %edx
    479 	lea	32(%eax, %ecx), %eax
    480 	POP (%edi)
    481 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    482 
    483 	CFI_PUSH (%edi)
    484 
    485 	.p2align 4
    486 L(sh_1_no_prefetch):
    487 	lea	-32(%ecx), %ecx
    488 	lea	-1(%eax), %eax
    489 	xor	%edi, %edi
    490 
    491 	.p2align 4
    492 L(sh_1_no_prefetch_loop):
    493 	movdqa	16(%eax, %edi), %xmm2
    494 	sub	$32, %ecx
    495 	movdqa	32(%eax, %edi), %xmm3
    496 	movdqa	%xmm3, %xmm4
    497 	palignr	$1, %xmm2, %xmm3
    498 	palignr	$1, %xmm1, %xmm2
    499 	lea	32(%edi), %edi
    500 	movdqa	%xmm2, -32(%edx, %edi)
    501 	movdqa	%xmm3, -16(%edx, %edi)
    502 	jb	L(sh_1_end_no_prefetch_loop)
    503 
    504 	movdqa	16(%eax, %edi), %xmm2
    505 	sub	$32, %ecx
    506 	movdqa	32(%eax, %edi), %xmm3
    507 	movdqa	%xmm3, %xmm1
    508 	palignr	$1, %xmm2, %xmm3
    509 	palignr	$1, %xmm4, %xmm2
    510 	lea	32(%edi), %edi
    511 	movdqa	%xmm2, -32(%edx, %edi)
    512 	movdqa	%xmm3, -16(%edx, %edi)
    513 	jae	L(sh_1_no_prefetch_loop)
    514 
    515 L(sh_1_end_no_prefetch_loop):
    516 	lea	32(%ecx), %ecx
    517 	add	%ecx, %edi
    518 	add	%edi, %edx
    519 	lea	1(%edi, %eax), %eax
    520 	POP	(%edi)
    521 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    522 
    523 	CFI_PUSH (%edi)
    524 
    525 	.p2align 4
    526 L(shl_2):
    527 #ifndef USE_AS_MEMMOVE
    528 	movaps	-2(%eax), %xmm1
    529 #else
    530 	movl	DEST+4(%esp), %edi
    531 	movaps	-2(%eax), %xmm1
    532 	movdqu	%xmm0, (%edi)
    533 #endif
    534 #ifdef DATA_CACHE_SIZE_HALF
    535 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    536 #else
    537 # if (defined SHARED || defined __PIC__)
    538 	SETUP_PIC_REG(bx)
    539 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    540 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    541 # else
    542 	cmp	__x86_data_cache_size_half, %ecx
    543 # endif
    544 #endif
    545 	jb L(sh_2_no_prefetch)
    546 
    547 	lea	-64(%ecx), %ecx
    548 
    549 	.p2align 4
    550 L(Shl2LoopStart):
    551 	prefetcht0 0x1c0(%eax)
    552 	prefetcht0 0x1c0(%edx)
    553 	movaps	14(%eax), %xmm2
    554 	movaps	30(%eax), %xmm3
    555 	movaps	46(%eax), %xmm4
    556 	movaps	62(%eax), %xmm5
    557 	movaps	%xmm5, %xmm7
    558 	palignr	$2, %xmm4, %xmm5
    559 	palignr	$2, %xmm3, %xmm4
    560 	movaps	%xmm5, 48(%edx)
    561 	palignr	$2, %xmm2, %xmm3
    562 	lea	64(%eax), %eax
    563 	palignr	$2, %xmm1, %xmm2
    564 	movaps	%xmm4, 32(%edx)
    565 	movaps	%xmm3, 16(%edx)
    566 	movaps	%xmm7, %xmm1
    567 	movaps	%xmm2, (%edx)
    568 	lea	64(%edx), %edx
    569 	sub	$64, %ecx
    570 	ja	L(Shl2LoopStart)
    571 
    572 L(Shl2LoopLeave):
    573 	add	$32, %ecx
    574 	jle	L(shl_end_0)
    575 
    576 	movaps	14(%eax), %xmm2
    577 	movaps	30(%eax), %xmm3
    578 	palignr	$2, %xmm2, %xmm3
    579 	palignr	$2, %xmm1, %xmm2
    580 	movaps	%xmm2, (%edx)
    581 	movaps	%xmm3, 16(%edx)
    582 	lea	32(%edx, %ecx), %edx
    583 	lea	32(%eax, %ecx), %eax
    584 	POP (%edi)
    585 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    586 
    587 	CFI_PUSH (%edi)
    588 
    589 	.p2align 4
    590 L(sh_2_no_prefetch):
    591 	lea	-32(%ecx), %ecx
    592 	lea	-2(%eax), %eax
    593 	xor	%edi, %edi
    594 
    595 	.p2align 4
    596 L(sh_2_no_prefetch_loop):
    597 	movdqa	16(%eax, %edi), %xmm2
    598 	sub	$32, %ecx
    599 	movdqa	32(%eax, %edi), %xmm3
    600 	movdqa	%xmm3, %xmm4
    601 	palignr	$2, %xmm2, %xmm3
    602 	palignr	$2, %xmm1, %xmm2
    603 	lea	32(%edi), %edi
    604 	movdqa	%xmm2, -32(%edx, %edi)
    605 	movdqa	%xmm3, -16(%edx, %edi)
    606 	jb	L(sh_2_end_no_prefetch_loop)
    607 
    608 	movdqa	16(%eax, %edi), %xmm2
    609 	sub	$32, %ecx
    610 	movdqa	32(%eax, %edi), %xmm3
    611 	movdqa	%xmm3, %xmm1
    612 	palignr	$2, %xmm2, %xmm3
    613 	palignr	$2, %xmm4, %xmm2
    614 	lea	32(%edi), %edi
    615 	movdqa	%xmm2, -32(%edx, %edi)
    616 	movdqa	%xmm3, -16(%edx, %edi)
    617 	jae	L(sh_2_no_prefetch_loop)
    618 
    619 L(sh_2_end_no_prefetch_loop):
    620 	lea	32(%ecx), %ecx
    621 	add	%ecx, %edi
    622 	add	%edi, %edx
    623 	lea	2(%edi, %eax), %eax
    624 	POP	(%edi)
    625 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    626 
    627 	CFI_PUSH (%edi)
    628 
    629 	.p2align 4
    630 L(shl_3):
    631 #ifndef USE_AS_MEMMOVE
    632 	movaps	-3(%eax), %xmm1
    633 #else
    634 	movl	DEST+4(%esp), %edi
    635 	movaps	-3(%eax), %xmm1
    636 	movdqu	%xmm0, (%edi)
    637 #endif
    638 #ifdef DATA_CACHE_SIZE_HALF
    639 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    640 #else
    641 # if (defined SHARED || defined __PIC__)
    642 	SETUP_PIC_REG(bx)
    643 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    644 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    645 # else
    646 	cmp	__x86_data_cache_size_half, %ecx
    647 # endif
    648 #endif
    649 	jb L(sh_3_no_prefetch)
    650 
    651 	lea	-64(%ecx), %ecx
    652 
    653 	.p2align 4
    654 L(Shl3LoopStart):
    655 	prefetcht0 0x1c0(%eax)
    656 	prefetcht0 0x1c0(%edx)
    657 	movaps	13(%eax), %xmm2
    658 	movaps	29(%eax), %xmm3
    659 	movaps	45(%eax), %xmm4
    660 	movaps	61(%eax), %xmm5
    661 	movaps	%xmm5, %xmm7
    662 	palignr	$3, %xmm4, %xmm5
    663 	palignr	$3, %xmm3, %xmm4
    664 	movaps	%xmm5, 48(%edx)
    665 	palignr	$3, %xmm2, %xmm3
    666 	lea	64(%eax), %eax
    667 	palignr	$3, %xmm1, %xmm2
    668 	movaps	%xmm4, 32(%edx)
    669 	movaps	%xmm3, 16(%edx)
    670 	movaps	%xmm7, %xmm1
    671 	movaps	%xmm2, (%edx)
    672 	lea	64(%edx), %edx
    673 	sub	$64, %ecx
    674 	ja	L(Shl3LoopStart)
    675 
    676 L(Shl3LoopLeave):
    677 	add	$32, %ecx
    678 	jle	L(shl_end_0)
    679 
    680 	movaps	13(%eax), %xmm2
    681 	movaps	29(%eax), %xmm3
    682 	palignr	$3, %xmm2, %xmm3
    683 	palignr	$3, %xmm1, %xmm2
    684 	movaps	%xmm2, (%edx)
    685 	movaps	%xmm3, 16(%edx)
    686 	lea	32(%edx, %ecx), %edx
    687 	lea	32(%eax, %ecx), %eax
    688 	POP (%edi)
    689 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    690 
    691 	CFI_PUSH (%edi)
    692 
    693 	.p2align 4
    694 L(sh_3_no_prefetch):
    695 	lea	-32(%ecx), %ecx
    696 	lea	-3(%eax), %eax
    697 	xor	%edi, %edi
    698 
    699 	.p2align 4
    700 L(sh_3_no_prefetch_loop):
    701 	movdqa	16(%eax, %edi), %xmm2
    702 	sub	$32, %ecx
    703 	movdqa	32(%eax, %edi), %xmm3
    704 	movdqa	%xmm3, %xmm4
    705 	palignr	$3, %xmm2, %xmm3
    706 	palignr	$3, %xmm1, %xmm2
    707 	lea	32(%edi), %edi
    708 	movdqa	%xmm2, -32(%edx, %edi)
    709 	movdqa	%xmm3, -16(%edx, %edi)
    710 
    711 	jb	L(sh_3_end_no_prefetch_loop)
    712 
    713 	movdqa	16(%eax, %edi), %xmm2
    714 	sub	$32, %ecx
    715 	movdqa	32(%eax, %edi), %xmm3
    716 	movdqa	%xmm3, %xmm1
    717 	palignr	$3, %xmm2, %xmm3
    718 	palignr	$3, %xmm4, %xmm2
    719 	lea	32(%edi), %edi
    720 	movdqa	%xmm2, -32(%edx, %edi)
    721 	movdqa	%xmm3, -16(%edx, %edi)
    722 
    723 	jae	L(sh_3_no_prefetch_loop)
    724 
    725 L(sh_3_end_no_prefetch_loop):
    726 	lea	32(%ecx), %ecx
    727 	add	%ecx, %edi
    728 	add	%edi, %edx
    729 	lea	3(%edi, %eax), %eax
    730 	POP	(%edi)
    731 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    732 
    733 	CFI_PUSH (%edi)
    734 
    735 	.p2align 4
    736 L(shl_4):
    737 #ifndef USE_AS_MEMMOVE
    738 	movaps	-4(%eax), %xmm1
    739 #else
    740 	movl	DEST+4(%esp), %edi
    741 	movaps	-4(%eax), %xmm1
    742 	movdqu	%xmm0, (%edi)
    743 #endif
    744 #ifdef DATA_CACHE_SIZE_HALF
    745 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    746 #else
    747 # if (defined SHARED || defined __PIC__)
    748 	SETUP_PIC_REG(bx)
    749 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    750 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    751 # else
    752 	cmp	__x86_data_cache_size_half, %ecx
    753 # endif
    754 #endif
    755 	jb L(sh_4_no_prefetch)
    756 
    757 	lea	-64(%ecx), %ecx
    758 
    759 	.p2align 4
    760 L(Shl4LoopStart):
    761 	prefetcht0 0x1c0(%eax)
    762 	prefetcht0 0x1c0(%edx)
    763 	movaps	12(%eax), %xmm2
    764 	movaps	28(%eax), %xmm3
    765 	movaps	44(%eax), %xmm4
    766 	movaps	60(%eax), %xmm5
    767 	movaps	%xmm5, %xmm7
    768 	palignr	$4, %xmm4, %xmm5
    769 	palignr	$4, %xmm3, %xmm4
    770 	movaps	%xmm5, 48(%edx)
    771 	palignr	$4, %xmm2, %xmm3
    772 	lea	64(%eax), %eax
    773 	palignr	$4, %xmm1, %xmm2
    774 	movaps	%xmm4, 32(%edx)
    775 	movaps	%xmm3, 16(%edx)
    776 	movaps	%xmm7, %xmm1
    777 	movaps	%xmm2, (%edx)
    778 	lea	64(%edx), %edx
    779 	sub	$64, %ecx
    780 	ja	L(Shl4LoopStart)
    781 
    782 L(Shl4LoopLeave):
    783 	add	$32, %ecx
    784 	jle	L(shl_end_0)
    785 
    786 	movaps	12(%eax), %xmm2
    787 	movaps	28(%eax), %xmm3
    788 	palignr	$4, %xmm2, %xmm3
    789 	palignr	$4, %xmm1, %xmm2
    790 	movaps	%xmm2, (%edx)
    791 	movaps	%xmm3, 16(%edx)
    792 	lea	32(%edx, %ecx), %edx
    793 	lea	32(%eax, %ecx), %eax
    794 	POP (%edi)
    795 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    796 
    797 	CFI_PUSH (%edi)
    798 
    799 	.p2align 4
    800 L(sh_4_no_prefetch):
    801 	lea	-32(%ecx), %ecx
    802 	lea	-4(%eax), %eax
    803 	xor	%edi, %edi
    804 
    805 	.p2align 4
    806 L(sh_4_no_prefetch_loop):
    807 	movdqa	16(%eax, %edi), %xmm2
    808 	sub	$32, %ecx
    809 	movdqa	32(%eax, %edi), %xmm3
    810 	movdqa	%xmm3, %xmm4
    811 	palignr	$4, %xmm2, %xmm3
    812 	palignr	$4, %xmm1, %xmm2
    813 	lea	32(%edi), %edi
    814 	movdqa	%xmm2, -32(%edx, %edi)
    815 	movdqa	%xmm3, -16(%edx, %edi)
    816 
    817 	jb	L(sh_4_end_no_prefetch_loop)
    818 
    819 	movdqa	16(%eax, %edi), %xmm2
    820 	sub	$32, %ecx
    821 	movdqa	32(%eax, %edi), %xmm3
    822 	movdqa	%xmm3, %xmm1
    823 	palignr	$4, %xmm2, %xmm3
    824 	palignr	$4, %xmm4, %xmm2
    825 	lea	32(%edi), %edi
    826 	movdqa	%xmm2, -32(%edx, %edi)
    827 	movdqa	%xmm3, -16(%edx, %edi)
    828 
    829 	jae	L(sh_4_no_prefetch_loop)
    830 
    831 L(sh_4_end_no_prefetch_loop):
    832 	lea	32(%ecx), %ecx
    833 	add	%ecx, %edi
    834 	add	%edi, %edx
    835 	lea	4(%edi, %eax), %eax
    836 	POP	(%edi)
    837 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    838 
    839 	CFI_PUSH (%edi)
    840 
    841 	.p2align 4
    842 L(shl_5):
    843 #ifndef USE_AS_MEMMOVE
    844 	movaps	-5(%eax), %xmm1
    845 #else
    846 	movl	DEST+4(%esp), %edi
    847 	movaps	-5(%eax), %xmm1
    848 	movdqu	%xmm0, (%edi)
    849 #endif
    850 #ifdef DATA_CACHE_SIZE_HALF
    851 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    852 #else
    853 # if (defined SHARED || defined __PIC__)
    854 	SETUP_PIC_REG(bx)
    855 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    856 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    857 # else
    858 	cmp	__x86_data_cache_size_half, %ecx
    859 # endif
    860 #endif
    861 	jb L(sh_5_no_prefetch)
    862 
    863 	lea	-64(%ecx), %ecx
    864 
    865 	.p2align 4
    866 L(Shl5LoopStart):
    867 	prefetcht0 0x1c0(%eax)
    868 	prefetcht0 0x1c0(%edx)
    869 	movaps	11(%eax), %xmm2
    870 	movaps	27(%eax), %xmm3
    871 	movaps	43(%eax), %xmm4
    872 	movaps	59(%eax), %xmm5
    873 	movaps	%xmm5, %xmm7
    874 	palignr	$5, %xmm4, %xmm5
    875 	palignr	$5, %xmm3, %xmm4
    876 	movaps	%xmm5, 48(%edx)
    877 	palignr	$5, %xmm2, %xmm3
    878 	lea	64(%eax), %eax
    879 	palignr	$5, %xmm1, %xmm2
    880 	movaps	%xmm4, 32(%edx)
    881 	movaps	%xmm3, 16(%edx)
    882 	movaps	%xmm7, %xmm1
    883 	movaps	%xmm2, (%edx)
    884 	lea	64(%edx), %edx
    885 	sub	$64, %ecx
    886 	ja	L(Shl5LoopStart)
    887 
    888 L(Shl5LoopLeave):
    889 	add	$32, %ecx
    890 	jle	L(shl_end_0)
    891 
    892 	movaps	11(%eax), %xmm2
    893 	movaps	27(%eax), %xmm3
    894 	palignr	$5, %xmm2, %xmm3
    895 	palignr	$5, %xmm1, %xmm2
    896 	movaps	%xmm2, (%edx)
    897 	movaps	%xmm3, 16(%edx)
    898 	lea	32(%edx, %ecx), %edx
    899 	lea	32(%eax, %ecx), %eax
    900 	POP (%edi)
    901 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    902 
    903 	CFI_PUSH (%edi)
    904 
    905 	.p2align 4
    906 L(sh_5_no_prefetch):
    907 	lea	-32(%ecx), %ecx
    908 	lea	-5(%eax), %eax
    909 	xor	%edi, %edi
    910 
    911 	.p2align 4
    912 L(sh_5_no_prefetch_loop):
    913 	movdqa	16(%eax, %edi), %xmm2
    914 	sub	$32, %ecx
    915 	movdqa	32(%eax, %edi), %xmm3
    916 	movdqa	%xmm3, %xmm4
    917 	palignr	$5, %xmm2, %xmm3
    918 	palignr	$5, %xmm1, %xmm2
    919 	lea	32(%edi), %edi
    920 	movdqa	%xmm2, -32(%edx, %edi)
    921 	movdqa	%xmm3, -16(%edx, %edi)
    922 
    923 	jb	L(sh_5_end_no_prefetch_loop)
    924 
    925 	movdqa	16(%eax, %edi), %xmm2
    926 	sub	$32, %ecx
    927 	movdqa	32(%eax, %edi), %xmm3
    928 	movdqa	%xmm3, %xmm1
    929 	palignr	$5, %xmm2, %xmm3
    930 	palignr	$5, %xmm4, %xmm2
    931 	lea	32(%edi), %edi
    932 	movdqa	%xmm2, -32(%edx, %edi)
    933 	movdqa	%xmm3, -16(%edx, %edi)
    934 
    935 	jae	L(sh_5_no_prefetch_loop)
    936 
    937 L(sh_5_end_no_prefetch_loop):
    938 	lea	32(%ecx), %ecx
    939 	add	%ecx, %edi
    940 	add	%edi, %edx
    941 	lea	5(%edi, %eax), %eax
    942 	POP	(%edi)
    943 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
    944 
    945 	CFI_PUSH (%edi)
    946 
    947 	.p2align 4
    948 L(shl_6):
    949 #ifndef USE_AS_MEMMOVE
    950 	movaps	-6(%eax), %xmm1
    951 #else
    952 	movl	DEST+4(%esp), %edi
    953 	movaps	-6(%eax), %xmm1
    954 	movdqu	%xmm0, (%edi)
    955 #endif
    956 #ifdef DATA_CACHE_SIZE_HALF
    957 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    958 #else
    959 # if (defined SHARED || defined __PIC__)
    960 	SETUP_PIC_REG(bx)
    961 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    962 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    963 # else
    964 	cmp	__x86_data_cache_size_half, %ecx
    965 # endif
    966 #endif
    967 	jb L(sh_6_no_prefetch)
    968 
    969 	lea	-64(%ecx), %ecx
    970 
    971 	.p2align 4
    972 L(Shl6LoopStart):
    973 	prefetcht0 0x1c0(%eax)
    974 	prefetcht0 0x1c0(%edx)
    975 	movaps	10(%eax), %xmm2
    976 	movaps	26(%eax), %xmm3
    977 	movaps	42(%eax), %xmm4
    978 	movaps	58(%eax), %xmm5
    979 	movaps	%xmm5, %xmm7
    980 	palignr	$6, %xmm4, %xmm5
    981 	palignr	$6, %xmm3, %xmm4
    982 	movaps	%xmm5, 48(%edx)
    983 	palignr	$6, %xmm2, %xmm3
    984 	lea	64(%eax), %eax
    985 	palignr	$6, %xmm1, %xmm2
    986 	movaps	%xmm4, 32(%edx)
    987 	movaps	%xmm3, 16(%edx)
    988 	movaps	%xmm7, %xmm1
    989 	movaps	%xmm2, (%edx)
    990 	lea	64(%edx), %edx
    991 	sub	$64, %ecx
    992 	ja	L(Shl6LoopStart)
    993 
    994 L(Shl6LoopLeave):
    995 	add	$32, %ecx
    996 	jle	L(shl_end_0)
    997 
    998 	movaps	10(%eax), %xmm2
    999 	movaps	26(%eax), %xmm3
   1000 	palignr	$6, %xmm2, %xmm3
   1001 	palignr	$6, %xmm1, %xmm2
   1002 	movaps	%xmm2, (%edx)
   1003 	movaps	%xmm3, 16(%edx)
   1004 	lea	32(%edx, %ecx), %edx
   1005 	lea	32(%eax, %ecx), %eax
   1006 	POP (%edi)
   1007 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   1008 
   1009 	CFI_PUSH (%edi)
   1010 
   1011 	.p2align 4
   1012 L(sh_6_no_prefetch):
   1013 	lea	-32(%ecx), %ecx
   1014 	lea	-6(%eax), %eax
   1015 	xor	%edi, %edi
   1016 
   1017 	.p2align 4
   1018 L(sh_6_no_prefetch_loop):
   1019 	movdqa	16(%eax, %edi), %xmm2
   1020 	sub	$32, %ecx
   1021 	movdqa	32(%eax, %edi), %xmm3
   1022 	movdqa	%xmm3, %xmm4
   1023 	palignr	$6, %xmm2, %xmm3
   1024 	palignr	$6, %xmm1, %xmm2
   1025 	lea	32(%edi), %edi
   1026 	movdqa	%xmm2, -32(%edx, %edi)
   1027 	movdqa	%xmm3, -16(%edx, %edi)
   1028 
   1029 	jb	L(sh_6_end_no_prefetch_loop)
   1030 
   1031 	movdqa	16(%eax, %edi), %xmm2
   1032 	sub	$32, %ecx
   1033 	movdqa	32(%eax, %edi), %xmm3
   1034 	movdqa	%xmm3, %xmm1
   1035 	palignr	$6, %xmm2, %xmm3
   1036 	palignr	$6, %xmm4, %xmm2
   1037 	lea	32(%edi), %edi
   1038 	movdqa	%xmm2, -32(%edx, %edi)
   1039 	movdqa	%xmm3, -16(%edx, %edi)
   1040 
   1041 	jae	L(sh_6_no_prefetch_loop)
   1042 
   1043 L(sh_6_end_no_prefetch_loop):
   1044 	lea	32(%ecx), %ecx
   1045 	add	%ecx, %edi
   1046 	add	%edi, %edx
   1047 	lea	6(%edi, %eax), %eax
   1048 	POP	(%edi)
   1049 	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   1050 
   1051 	CFI_PUSH (%edi)
   1052 
	.p2align 4
/* Forward copy when the source is 7 bytes past a 16-byte boundary.
   %eax = src (so %eax+9 is 16-aligned), %edx = dst (16-aligned: all
   stores below are movaps/movdqa), %ecx = bytes remaining.  Aligned
   loads are recombined with palignr $7; %xmm1 carries the previous
   aligned chunk across iterations.  */
L(shl_7):
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1
#else
	/* memmove: write back the 16 head bytes kept in %xmm0 (saved
	   before this point — not visible here) to the original dest.  */
	movl	DEST+4(%esp), %edi
	movaps	-7(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Use the prefetching loop only for copies of at least half the data
   cache; smaller copies take the no-prefetch path below.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_7_no_prefetch)

	/* Bias the count by -64 so the loop can test with sub/ja.  */
	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes per iteration, prefetching 0x1c0 (448) bytes ahead on
   both streams.  %xmm7 preserves the last aligned chunk for the
   next iteration's first palignr.  */
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)

/* Loop done; %ecx holds remaining-64.  If more than 32 bytes remain,
   copy one more 32-byte chunk, then dispatch the tail via the table.  */
L(Shl7LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	/* Advance past the 32 bytes just copied plus the tail length,
	   then jump to the <48-byte forward tail-copy table.  */
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path (copy smaller than half the data cache):
   %eax is rebased by -7 and %edi is a running offset, unrolled 2x
   at 32 bytes per half.  */
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-7(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	/* %xmm4 keeps the newest chunk for the second half's palignr.  */
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

/* Undo the -32 bias, fold the offset into dst, undo the -7 source
   rebase, and dispatch the final <32 bytes through the table.  */
L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1156 
	.p2align 4
/* Forward copy when the source is 8 bytes past a 16-byte boundary:
   same structure as shl_7 but with palignr $8 and loads at +8/+24/...
   %eax = src, %edx = dst (16-aligned stores), %ecx = bytes left,
   %xmm1 = previous aligned chunk.  */
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_8_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead on both streams.  */
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)

/* Copy one final 32-byte chunk if needed, then use the tail table.
   (Label naming differs from the siblings' ShlNLoopLeave pattern.)  */
L(LoopLeave8):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -8, %edi as running offset,
   2x-unrolled 32-byte halves.  */
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-8(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

/* Restore the count, fix up src/dst (undoing the -8 rebase) and
   dispatch the final <32 bytes.  */
L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	8(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1260 
	.p2align 4
/* Forward copy when the source is 9 bytes past a 16-byte boundary:
   palignr $9 with aligned loads at +7/+23/+39/+55 (16-9=7).
   Same register roles as the other shl_N blocks.  */
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_9_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl9LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -9, 2x-unrolled 32-byte halves.  */
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-9(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

/* Fix up count and pointers (undo the -9 rebase), then dispatch.  */
L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	9(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1365 
	.p2align 4
/* Forward copy when the source is 10 bytes past a 16-byte boundary:
   palignr $10 with aligned loads at +6/+22/+38/+54 (16-10=6).  */
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-10(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_10_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl10LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -10, 2x-unrolled 32-byte halves.  */
L(sh_10_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-10(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

/* Fix up count and pointers (undo the -10 rebase), then dispatch.  */
L(sh_10_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	10(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1470 
	.p2align 4
/* Forward copy when the source is 11 bytes past a 16-byte boundary:
   palignr $11 with aligned loads at +5/+21/+37/+53 (16-11=5).  */
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-11(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_11_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl11LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -11, 2x-unrolled 32-byte halves.  */
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-11(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

/* Fix up count and pointers (undo the -11 rebase), then dispatch.  */
L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	11(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1575 
	.p2align 4
/* Forward copy when the source is 12 bytes past a 16-byte boundary:
   palignr $12 with aligned loads at +4/+20/+36/+52 (16-12=4).  */
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-12(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_12_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl12LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -12, 2x-unrolled 32-byte halves.  */
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-12(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

/* Fix up count and pointers (undo the -12 rebase), then dispatch.  */
L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	12(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1680 
	.p2align 4
/* Forward copy when the source is 13 bytes past a 16-byte boundary:
   palignr $13 with aligned loads at +3/+19/+35/+51 (16-13=3).  */
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-13(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_13_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl13LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -13, 2x-unrolled 32-byte halves.  */
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-13(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

/* Fix up count and pointers (undo the -13 rebase), then dispatch.  */
L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	13(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1785 
	.p2align 4
/* Forward copy when the source is 14 bytes past a 16-byte boundary:
   palignr $14 with aligned loads at +2/+18/+34/+50 (16-14=2).  */
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-14(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_14_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl14LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -14, 2x-unrolled 32-byte halves.  */
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-14(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

/* Fix up count and pointers (undo the -14 rebase), then dispatch.  */
L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	14(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1890 
	.p2align 4
/* Forward copy when the source is 15 bytes past a 16-byte boundary:
   palignr $15 with aligned loads at +1/+17/+33/+49 (16-15=1).  */
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1
#else
	/* memmove: write back the saved destination head from %xmm0.  */
	movl	DEST+4(%esp), %edi
	movaps	-15(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
/* Prefetch loop only for copies >= half the data cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb L(sh_15_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
/* 64 bytes/iteration with prefetch 0x1c0 ahead.  */
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)

/* Copy one final 32-byte chunk if needed, then tail-dispatch.  */
L(Shl15LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
/* No-prefetch path: src rebased by -15, 2x-unrolled 32-byte halves.  */
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-15(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

/* Fix up count and pointers (undo the -15 rebase), then dispatch.  */
L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	15(%edi, %eax), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)
   1995 
	.p2align 4
/* Shared exit for the ShlN main loops when at most 32 (biased) bytes
   remain: restore the true remaining count into %ecx, advance both
   pointers past what the tail table will re-subtract, and dispatch
   into the forward tail-copy jump table.  */
L(shl_end_0):
	lea	32(%ecx), %ecx
	lea	(%edx, %ecx), %edx
	lea	(%eax, %ecx), %eax
	POP	(%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
   2003 
	.p2align 4
/* Tail copies for counts ≡ 4 (mod 8): 44/36/28/20/12/4 bytes.
   Entries fall through, each copying 8 bytes addressed backward from
   the end (%eax/%edx point one past the region), finishing with the
   last 4 bytes as a 32-bit move.  */
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy;
   bcopy returns nothing.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2031 
	.p2align 4
/* Tail copies for counts ≡ 0 (mod 8): 40/32/24/16/8/0 bytes,
   falling through in 8-byte movq steps addressed from the end.  */
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2057 
	.p2align 4
/* 5-byte tail: two overlapping 4-byte moves (-5 and -4) cover all
   five bytes without byte-granular loads.  */
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2072 
	.p2align 4
/* Tail copies for counts ≡ 5 (mod 8): 45/37/29/21/13 bytes in
   fall-through 8-byte steps; the final 5 bytes are one 4-byte move
   plus one byte.  */
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2101 
	.p2align 4
/* Tail copies for counts ≡ 1 (mod 8): 41/33/25/17/9/1 bytes;
   8-byte fall-through steps ending with a single byte move.  */
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2129 
	.p2align 4
/* Tail copies for counts ≡ 6 (mod 8): 46/38/30/22/14/6 bytes;
   the final 6 bytes are a 4-byte move plus a 2-byte move.  */
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2159 
	.p2align 4
/* Tail copies for counts ≡ 2 (mod 8): 42/34/26/18/10/2 bytes;
   8-byte fall-through steps ending with a 2-byte move.  */
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2187 
	.p2align 4
/* Tail copies for counts ≡ 7 (mod 8): 47/39/31/23/15/7 bytes;
   the final 7 bytes are a 4-byte + 2-byte + 1-byte sequence
   (%eax is dead after its last load, so it doubles as scratch).  */
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2219 
	.p2align 4
/* Tail copies for counts ≡ 3 (mod 8): 43/35/27/19/11/3 bytes;
   the final 3 bytes are a 2-byte + 1-byte sequence.  */
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
/* Return value: end of dst for mempcpy, original dst for memcpy.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
   2249 
	.p2align 4
/* Aligned forward tail copies, reached from the aligned-copy path via
   L(table_48bytes_fwd_align).  As above, EAX = src + len and
   EDX = dst + len, and each chain falls through to the next smaller
   case; here the 16-byte steps use MOVDQA, which assumes the
   remaining region (EDX - N, EAX - N) is 16-byte aligned — guaranteed
   by the aligned loop that branches here (NOTE(review): alignment is
   established before the jump-table dispatch, outside this view).  */
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
/* Return value: dst + len for mempcpy (already in EDX), original dst
   for memcpy/memmove, nothing for bcopy.  Same pattern in every chain
   below.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* 5 bytes: two overlapping dword moves (bytes -5..-2 and -4..-1).  */
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 13 mod 16: dqword steps, then 8 + overlapping 4 + 1.  */
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 9 mod 16: dqword steps, 8 bytes, then the final byte.  */
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 14 mod 16: dqword steps, 8, then 4 + 2 bytes.  */
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 10 mod 16: dqword steps, 8, then 2 bytes.  */
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 15 mod 16: dqword steps, 8, then 4 + 2 + 1 bytes.  */
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* EAX (src) is dead after this load.  */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 11 mod 16: dqword steps, 8, then 2 + 1 bytes.  */
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residue 12 mod 16: dqword steps, 8, then the final dword.  */
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END
   2599 
/* Unwind annotation only: the path that jumps to L(large_page) has EDI
   pushed on the stack (hence DEST+4 below).  */
	CFI_PUSH (%edi)

	.p2align 4
/* Copy path for lengths large enough to blow the cache: stream with
   non-temporal MOVNTDQ stores.  On entry EAX = src, EDX = dst
   (16-byte aligned for the stores), ECX = remaining byte count.  */
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	/* NOTE(review): XMM0 appears to hold 16 head bytes saved by the
	   memmove entry path before dst was advanced — set up outside
	   this view; confirm against the function prologue.  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	/* Bias ECX by -(128 + 16): 16 bytes just copied, and the loop
	   below tests before the final 128-byte iteration.  */
	lea	-0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
/* Main streaming loop: 128 bytes per iteration, unaligned loads,
   non-temporal (cache-bypassing) stores to the aligned destination.  */
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)		/* loop while the sub did not borrow */
	/* ECX went negative; LEA restores ECX += 0x80 (true remaining
	   count) WITHOUT touching the flags set by the CMP, so the JL
	   still tests "remaining < 0x40".  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	/* 64..127 bytes left: one 64-byte streamed chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	/* 32..63 bytes left: one 32-byte streamed chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Convert to end pointers (the tail table expects
	   EAX/EDX = src/dst + remaining) and fence the non-temporal
	   stores before the ordinary stores in the tail routines.  */
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
   2669 
	.p2align 4
/* Backward-copy tail routines, entered from L(table_48_bytes_bwd)
   with EAX = src and EDX = dst pointing at the START of the remaining
   region (positive displacements).  Each chain handles one residue
   class mod 8, copying from the highest offset down and falling
   through to the next smaller case.  */
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
/* Return value: original dst from the stack; mempcpy additionally
   adds LEN to return dst + len.  bcopy returns nothing.  Same pattern
   in every chain below.  */
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 0 mod 8 (40..8 bytes): pure qword chain.  */
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 5 mod 8: qword chain, then 4 + 1 bytes.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 1 mod 8: qword chain, then the first byte.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 6 mod 8: qword chain, then 4 + 2 bytes.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 2 mod 8: qword chain, then the first word.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 7 mod 8: qword chain, then 4 + 2 + 1 bytes.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax		/* EAX (src) is dead after this load.  */
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Residues 3 mod 8: qword chain, then 2 + 1 bytes.  */
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END
   2901 
   2902 
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
/* Dispatch table for forward tail copies, indexed by the remaining
   byte count 0..47 and used via BRANCH_TO_JMPTBL_ENTRY with scale 4.
   NOTE(review): JMPTBL is defined earlier in the file (outside this
   view); on PIC builds it presumably stores target-minus-table
   offsets rather than absolute addresses — confirm against the macro
   definition.  Entry order must match the index exactly.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
/* Same as above, but targets the MOVDQA-based tails used when the
   remaining region is 16-byte aligned.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
/* Dispatch for the shifted-copy loops L(shl_0)..L(shl_15) (defined
   earlier, outside this view).  NOTE(review): index 0..15 is
   presumably the src/dst misalignment within 16 bytes used to select
   the PALIGNR shift — confirm against the dispatch site.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
/* Dispatch table for backward tail copies, indexed by remaining byte
   count 0..47; targets expect EAX/EDX = start of the region.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
   3077 
#ifdef USE_AS_MEMMOVE
	.p2align 4
/* memmove path for overlapping regions with dst > src: copy from the
   end downward.  On entry EAX = src, EDX = dst, ECX = len.  EDI and
   EDX are converted to one-past-the-end pointers; ECX tracks the
   bytes still to copy above the current position.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx	/* EDX = dst + len */
	lea	(%ecx,%edi,1),%edi	/* EDI = src + len */
	/* Align the destination END to 4 bytes first.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind to the START of the final ECX-byte chunk and dispatch
	   to the backward tail table (which uses positive offsets).  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* Unwind annotation: EDI is still pushed on the paths below.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Destination end is not 4-byte aligned: copy 1 and/or 2 bytes from
   the top until it is.  Pointers are pre-decremented, so each store
   lands just below the old end.  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)	/* tiny copy: table handles it as-is */
	testl	$1, %edx
	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
	then	(EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Peel up to three dword
   copies (unrolled rather than looped) until EDX hits a 16-byte
   boundary.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main backward loop: 64 bytes per iteration, highest chunk first.
   Loads are unaligned (MOVDQU); stores use MOVDQA — EDX is 16-byte
   aligned here thanks to L(bk_ssse3_align).  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
   3199 
   3200 END (MEMCPY)
   3201