      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #ifndef MEMCPY
     32 # define MEMCPY         ssse3_memcpy5
     33 #endif
     34 
     35 #ifndef L
     36 # define L(label)	.L##label
     37 #endif
     38 
     39 #ifndef ALIGN
     40 # define ALIGN(n)	.p2align n
     41 #endif
     42 
     43 #ifndef cfi_startproc
     44 # define cfi_startproc			.cfi_startproc
     45 #endif
     46 
     47 #ifndef cfi_endproc
     48 # define cfi_endproc			.cfi_endproc
     49 #endif
     50 
     51 #ifndef cfi_rel_offset
     52 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     53 #endif
     54 
     55 #ifndef cfi_restore
     56 # define cfi_restore(reg)		.cfi_restore (reg)
     57 #endif
     58 
     59 #ifndef cfi_adjust_cfa_offset
     60 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     61 #endif
     62 
     63 #ifndef ENTRY
     64 # define ENTRY(name)			\
     65 	.type name,  @function; 	\
     66 	.globl name;			\
     67 	.p2align 4;			\
     68 name:					\
     69 	cfi_startproc
     70 #endif
     71 
     72 #ifndef END
     73 # define END(name)			\
     74 	cfi_endproc;			\
     75 	.size name, .-name
     76 #endif
     77 
     78 #ifdef USE_AS_BCOPY
     79 # define SRC		PARMS
     80 # define DEST		SRC+4
     81 # define LEN		DEST+4
     82 #else
     83 # define DEST		PARMS
     84 # define SRC		DEST+4
     85 # define LEN		SRC+4
     86 #endif
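/* Illustrative note (not from the original source): these offsets name the
   incoming stack arguments.  PARMS is defined below as 4 (just the return
   address) or 8 when SHARED (return address plus the saved %ebx), so for
   plain memcpy/memmove the layout is roughly:

	PARMS(%esp)	dst  (DEST)
	PARMS+4(%esp)	src  (SRC)
	PARMS+8(%esp)	len  (LEN)

   bcopy takes (src, dst, len), which is why USE_AS_BCOPY swaps SRC/DEST.  */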
     87 
     88 #define CFI_PUSH(REG)						\
     89   cfi_adjust_cfa_offset (4);					\
     90   cfi_rel_offset (REG, 0)
     91 
     92 #define CFI_POP(REG)						\
     93   cfi_adjust_cfa_offset (-4);					\
     94   cfi_restore (REG)
     95 
     96 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     97 #define POP(REG)	popl REG; CFI_POP (REG)
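/* For reference, PUSH (%ebx) expands to
	pushl %ebx; cfi_adjust_cfa_offset (4); cfi_rel_offset (%ebx, 0)
   i.e. the push plus the CFI notes that keep unwind information correct
   while callee-saved registers are spilled; POP undoes both.  */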
     98 
     99 #ifdef SHARED
    100 # define PARMS		8		/* Preserve EBX.  */
    101 # define ENTRANCE	PUSH (%ebx);
    102 # define RETURN_END	POP (%ebx); ret
    103 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    104 # define JMPTBL(I, B)	I - B
    105 
    106 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
     107    jump table with relative offsets.  INDEX is a register containing the
     108    index into the jump table.  SCALE is the scale of INDEX.  */
    109 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    110     /* We first load PC into EBX.  */				\
    111     call	__i686.get_pc_thunk.bx;				\
    112     /* Get the address of the jump table.  */			\
    113     addl	$(TABLE - .), %ebx;				\
    114     /* Get the entry and convert the relative offset to the	\
    115        absolute address.  */					\
    116     addl	(%ebx,INDEX,SCALE), %ebx;			\
    117     /* We loaded the jump table.  Go.  */			\
    118     jmp		*%ebx
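/* In rough C terms (illustrative only) the PIC dispatch above is:

	int32_t *table = <pc-relative address of TABLE>;
	goto *((char *) table + table[INDEX]);

   __i686.get_pc_thunk.bx (defined below) returns its caller's return
   address in %ebx; adding (TABLE - .) turns that into the table base, and
   each entry is an offset relative to that base (see JMPTBL above).  */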
    119 
    120 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
    121     addl	$(TABLE - .), %ebx
    122 
    123 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
    124     addl	(%ebx,INDEX,SCALE), %ebx;			\
    125     /* We loaded the jump table.  Go.  */			\
    126     jmp		*%ebx
    127 
    128 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
    129 	.globl	__i686.get_pc_thunk.bx
    130 	.hidden	__i686.get_pc_thunk.bx
    131 	ALIGN (4)
    132 	.type	__i686.get_pc_thunk.bx,@function
    133 __i686.get_pc_thunk.bx:
    134 	movl	(%esp), %ebx
    135 	ret
    136 #else
    137 # define PARMS		4
    138 # define ENTRANCE
    139 # define RETURN_END	ret
    140 # define RETURN		RETURN_END
    141 # define JMPTBL(I, B)	I
    142 
    143 /* Branch to an entry in a jump table.  TABLE is a jump table with
     144    absolute offsets.  INDEX is a register containing the index into the
     145    jump table.  SCALE is the scale of INDEX.  */
    146 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    147     jmp		*TABLE(,INDEX,SCALE)
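/* Non-PIC case (descriptive note): the table holds absolute addresses, so
   the dispatch is a single indirect jump through TABLE(,INDEX,SCALE).  */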
    148 
    149 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
    150 
    151 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
    152     jmp		*TABLE(,INDEX,SCALE)
    153 #endif
    154 
    155 	.section .text.ssse3,"ax",@progbits
    156 ENTRY (MEMCPY)
    157 	ENTRANCE
    158 	movl	LEN(%esp), %ecx
    159 	movl	SRC(%esp), %eax
    160 	movl	DEST(%esp), %edx
    161 
    162 #ifdef USE_AS_MEMMOVE
    163 	cmp	%eax, %edx
    164 	jb	L(copy_forward)
    165 	je	L(fwd_write_0bytes)
    166 	cmp	$32, %ecx
    167 	jae	L(memmove_bwd)
    168 	jmp	L(bk_write_less32bytes_2)
    169 L(memmove_bwd):
    170 	add	%ecx, %eax
    171 	cmp	%eax, %edx
    172 	movl	SRC(%esp), %eax
    173 	jb	L(copy_backward)
    174 
    175 L(copy_forward):
    176 #endif
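/* Descriptive summary of the direction choice above (USE_AS_MEMMOVE): if
   dst < src, or dst >= src + len, a forward copy cannot clobber unread
   source bytes, so execution falls through to the forward path; if
   src < dst < src + len the tail of the source would be overwritten, so
   L(copy_backward) copies from the end instead.  Moves shorter than 32
   bytes go straight to the small backward jump table.  */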
    177 	cmp	$48, %ecx
    178 	jae	L(48bytesormore)
    179 
    180 L(fwd_write_less32bytes):
    181 #ifndef USE_AS_MEMMOVE
    182 	cmp	%dl, %al
    183 	jb	L(bk_write)
    184 #endif
    185 	add	%ecx, %edx
    186 	add	%ecx, %eax
    187 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    188 #ifndef USE_AS_MEMMOVE
    189 L(bk_write):
    190 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
    191 #endif
    192 
    193 	ALIGN (4)
     194 /* ECX >= 48.  The code below aligns EDX (dest) up to a 16-byte boundary.  */
    195 L(48bytesormore):
    196 	movdqu	(%eax), %xmm0
    197 	PUSH (%edi)
    198 	movl	%edx, %edi
    199 	and	$-16, %edx
    200 	PUSH (%esi)
    201 	add	$16, %edx
    202 	movl	%edi, %esi
    203 	sub	%edx, %edi
    204 	add	%edi, %ecx
    205 	sub	%edi, %eax
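/* Worked example of the alignment code above (illustrative only): with
   dst = 0x1007 and len = 100, %edx becomes 0x1010 (the next 16-byte
   boundary), %edi = 0x1007 - 0x1010 = -9, so %ecx drops to 91 and %eax
   advances by 9.  The 16 source bytes saved in %xmm0 are stored to the
   original dst (%esi) on the chosen path, covering the unaligned head.  */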
    206 
    207 #ifdef SHARED_CACHE_SIZE_HALF
    208 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
    209 #else
    210 # ifdef SHARED
    211 	call	__i686.get_pc_thunk.bx
    212 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    213 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
    214 # else
    215 	cmp	__x86_shared_cache_size_half, %ecx
    216 # endif
    217 #endif
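/* Strategy selection (descriptive note): copies of at least half the
   shared cache size take the L(large_page) path, which streams with
   non-temporal stores.  Smaller copies dispatch through L(shl_table) on
   the low four bits of the adjusted source address, i.e. the source's
   misalignment relative to the now 16-byte aligned destination, picking
   one of the L(shl_N) palignr loops (L(shl_0) when already aligned).  */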
    218 
    219 	mov	%eax, %edi
    220 	jae	L(large_page)
    221 	and	$0xf, %edi
    222 	jz	L(shl_0)
    223 
    224 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
    225 
    226 	ALIGN (4)
    227 L(shl_0):
    228 	movdqu	%xmm0, (%esi)
    229 	xor	%edi, %edi
    230 	POP (%esi)
    231 	cmp	$127, %ecx
    232 	ja	L(shl_0_gobble)
    233 	lea	-32(%ecx), %ecx
    234 L(shl_0_loop):
    235 	movdqa	(%eax, %edi), %xmm0
    236 	movdqa	16(%eax, %edi), %xmm1
    237 	sub	$32, %ecx
    238 	movdqa	%xmm0, (%edx, %edi)
    239 	movdqa	%xmm1, 16(%edx, %edi)
    240 	lea	32(%edi), %edi
    241 	jb	L(shl_0_end)
    242 
    243 	movdqa	(%eax, %edi), %xmm0
    244 	movdqa	16(%eax, %edi), %xmm1
    245 	sub	$32, %ecx
    246 	movdqa	%xmm0, (%edx, %edi)
    247 	movdqa	%xmm1, 16(%edx, %edi)
    248 	lea	32(%edi), %edi
    249 	jb	L(shl_0_end)
    250 
    251 	movdqa	(%eax, %edi), %xmm0
    252 	movdqa	16(%eax, %edi), %xmm1
    253 	sub	$32, %ecx
    254 	movdqa	%xmm0, (%edx, %edi)
    255 	movdqa	%xmm1, 16(%edx, %edi)
    256 	lea	32(%edi), %edi
    257 	jb	L(shl_0_end)
    258 
    259 	movdqa	(%eax, %edi), %xmm0
    260 	movdqa	16(%eax, %edi), %xmm1
    261 	sub	$32, %ecx
    262 	movdqa	%xmm0, (%edx, %edi)
    263 	movdqa	%xmm1, 16(%edx, %edi)
    264 	lea	32(%edi), %edi
    265 L(shl_0_end):
    266 	lea	32(%ecx), %ecx
    267 	add	%ecx, %edi
    268 	add	%edi, %edx
    269 	add	%edi, %eax
    270 	POP (%edi)
    271 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    272 
    273 L(shl_0_gobble):
    274 
    275 #ifdef DATA_CACHE_SIZE_HALF
    276 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    277 #else
    278 # ifdef SHARED
    279 	call	__i686.get_pc_thunk.bx
    280 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    281 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    282 # else
    283 	cmp	__x86_data_cache_size_half, %ecx
    284 # endif
    285 #endif
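/* Descriptive note: within the aligned L(shl_0) path, blocks smaller than
   half the data cache size use the plain movdqa loop below; larger blocks
   take L(shl_0_gobble_mem_loop), which adds prefetcht0 hints ahead of the
   same 128-byte-per-iteration copy.  */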
    286 
    287 	POP (%edi)
    288 	lea	-128(%ecx), %ecx
    289 	jae	L(shl_0_gobble_mem_loop)
    290 L(shl_0_gobble_cache_loop):
    291 	movdqa	(%eax), %xmm0
    292 	movdqa	0x10(%eax), %xmm1
    293 	movdqa	0x20(%eax), %xmm2
    294 	movdqa	0x30(%eax), %xmm3
    295 	movdqa	0x40(%eax), %xmm4
    296 	movdqa	0x50(%eax), %xmm5
    297 	movdqa	0x60(%eax), %xmm6
    298 	movdqa	0x70(%eax), %xmm7
    299 	lea	0x80(%eax), %eax
    300 	sub	$128, %ecx
    301 	movdqa	%xmm0, (%edx)
    302 	movdqa	%xmm1, 0x10(%edx)
    303 	movdqa	%xmm2, 0x20(%edx)
    304 	movdqa	%xmm3, 0x30(%edx)
    305 	movdqa	%xmm4, 0x40(%edx)
    306 	movdqa	%xmm5, 0x50(%edx)
    307 	movdqa	%xmm6, 0x60(%edx)
    308 	movdqa	%xmm7, 0x70(%edx)
    309 	lea	0x80(%edx), %edx
    310 
    311 	jae	L(shl_0_gobble_cache_loop)
    312 	cmp	$-0x40, %ecx
    313 	lea	0x80(%ecx), %ecx
    314 	jl	L(shl_0_cache_less_64bytes)
    315 
    316 	movdqa	(%eax), %xmm0
    317 	sub	$0x40, %ecx
    318 	movdqa	0x10(%eax), %xmm1
    319 
    320 	movdqa	%xmm0, (%edx)
    321 	movdqa	%xmm1, 0x10(%edx)
    322 
    323 	movdqa	0x20(%eax), %xmm0
    324 	movdqa	0x30(%eax), %xmm1
    325 	add	$0x40, %eax
    326 
    327 	movdqa	%xmm0, 0x20(%edx)
    328 	movdqa	%xmm1, 0x30(%edx)
    329 	add	$0x40, %edx
    330 L(shl_0_cache_less_64bytes):
    331 	cmp	$0x20, %ecx
    332 	jb	L(shl_0_cache_less_32bytes)
    333 	movdqa	(%eax), %xmm0
    334 	sub	$0x20, %ecx
    335 	movdqa	0x10(%eax), %xmm1
    336 	add	$0x20, %eax
    337 	movdqa	%xmm0, (%edx)
    338 	movdqa	%xmm1, 0x10(%edx)
    339 	add	$0x20, %edx
    340 L(shl_0_cache_less_32bytes):
    341 	cmp	$0x10, %ecx
    342 	jb	L(shl_0_cache_less_16bytes)
    343 	sub	$0x10, %ecx
    344 	movdqa	(%eax), %xmm0
    345 	add	$0x10, %eax
    346 	movdqa	%xmm0, (%edx)
    347 	add	$0x10, %edx
    348 L(shl_0_cache_less_16bytes):
    349 	add	%ecx, %edx
    350 	add	%ecx, %eax
    351 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    352 
    353 
    354 	ALIGN (4)
    355 L(shl_0_gobble_mem_loop):
    356 	prefetcht0 0x1c0(%eax)
    357 	prefetcht0 0x280(%eax)
    358 	prefetcht0 0x1c0(%edx)
    359 
    360 	movdqa	(%eax), %xmm0
    361 	movdqa	0x10(%eax), %xmm1
    362 	movdqa	0x20(%eax), %xmm2
    363 	movdqa	0x30(%eax), %xmm3
    364 	movdqa	0x40(%eax), %xmm4
    365 	movdqa	0x50(%eax), %xmm5
    366 	movdqa	0x60(%eax), %xmm6
    367 	movdqa	0x70(%eax), %xmm7
    368 	lea	0x80(%eax), %eax
    369 	sub	$0x80, %ecx
    370 	movdqa	%xmm0, (%edx)
    371 	movdqa	%xmm1, 0x10(%edx)
    372 	movdqa	%xmm2, 0x20(%edx)
    373 	movdqa	%xmm3, 0x30(%edx)
    374 	movdqa	%xmm4, 0x40(%edx)
    375 	movdqa	%xmm5, 0x50(%edx)
    376 	movdqa	%xmm6, 0x60(%edx)
    377 	movdqa	%xmm7, 0x70(%edx)
    378 	lea	0x80(%edx), %edx
    379 
    380 	jae	L(shl_0_gobble_mem_loop)
    381 	cmp	$-0x40, %ecx
    382 	lea	0x80(%ecx), %ecx
    383 	jl	L(shl_0_mem_less_64bytes)
    384 
    385 	movdqa	(%eax), %xmm0
    386 	sub	$0x40, %ecx
    387 	movdqa	0x10(%eax), %xmm1
    388 
    389 	movdqa	%xmm0, (%edx)
    390 	movdqa	%xmm1, 0x10(%edx)
    391 
    392 	movdqa	0x20(%eax), %xmm0
    393 	movdqa	0x30(%eax), %xmm1
    394 	add	$0x40, %eax
    395 
    396 	movdqa	%xmm0, 0x20(%edx)
    397 	movdqa	%xmm1, 0x30(%edx)
    398 	add	$0x40, %edx
    399 L(shl_0_mem_less_64bytes):
    400 	cmp	$0x20, %ecx
    401 	jb	L(shl_0_mem_less_32bytes)
    402 	movdqa	(%eax), %xmm0
    403 	sub	$0x20, %ecx
    404 	movdqa	0x10(%eax), %xmm1
    405 	add	$0x20, %eax
    406 	movdqa	%xmm0, (%edx)
    407 	movdqa	%xmm1, 0x10(%edx)
    408 	add	$0x20, %edx
    409 L(shl_0_mem_less_32bytes):
    410 	cmp	$0x10, %ecx
    411 	jb	L(shl_0_mem_less_16bytes)
    412 	sub	$0x10, %ecx
    413 	movdqa	(%eax), %xmm0
    414 	add	$0x10, %eax
    415 	movdqa	%xmm0, (%edx)
    416 	add	$0x10, %edx
    417 L(shl_0_mem_less_16bytes):
    418 	add	%ecx, %edx
    419 	add	%ecx, %eax
    420 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    421 
    422 
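/* L(shl_1) through L(shl_15) below all follow one pattern; only the
   palignr shift amount N differs.  Illustrative sketch of one iteration:

	xmm2 = aligned load of src16 + 16    (src16 = src rounded down to 16)
	xmm3 = aligned load of src16 + 32
	xmm3 = palignr N (xmm3:xmm2)         concatenate, shift right N bytes
	xmm2 = palignr N (xmm2:prev)         prev = block kept from last round
	store xmm2, xmm3 to the 16-byte aligned destination

   Every load and store is therefore 16-byte aligned even though the source
   is misaligned by N bytes relative to the destination.  */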
    423 	ALIGN (4)
    424 L(shl_1):
    425 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    426 	lea	-1(%eax), %eax
    427 	movaps	(%eax), %xmm1
    428 	xor	%edi, %edi
    429 	lea	-32(%ecx), %ecx
    430 	movdqu	%xmm0, (%esi)
    431 	POP (%esi)
    432 L(shl_1_loop):
    433 
    434 	movdqa	16(%eax, %edi), %xmm2
    435 	sub	$32, %ecx
    436 	movdqa	32(%eax, %edi), %xmm3
    437 	movdqa	%xmm3, %xmm4
    438 	palignr	$1, %xmm2, %xmm3
    439 	palignr	$1, %xmm1, %xmm2
    440 	lea	32(%edi), %edi
    441 	movdqa	%xmm2, -32(%edx, %edi)
    442 	movdqa	%xmm3, -16(%edx, %edi)
    443 
    444 	jb	L(shl_1_end)
    445 
    446 	movdqa	16(%eax, %edi), %xmm2
    447 	sub	$32, %ecx
    448 	movdqa	32(%eax, %edi), %xmm3
    449 	movdqa	%xmm3, %xmm1
    450 	palignr	$1, %xmm2, %xmm3
    451 	palignr	$1, %xmm4, %xmm2
    452 	lea	32(%edi), %edi
    453 	movdqa	%xmm2, -32(%edx, %edi)
    454 	movdqa	%xmm3, -16(%edx, %edi)
    455 
    456 	jae	L(shl_1_loop)
    457 
    458 L(shl_1_end):
    459 	lea	32(%ecx), %ecx
    460 	add	%ecx, %edi
    461 	add	%edi, %edx
    462 	lea	1(%edi, %eax), %eax
    463 	POP (%edi)
    464 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    465 
    466 	ALIGN (4)
    467 L(shl_2):
    468 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    469 	lea	-2(%eax), %eax
    470 	movaps	(%eax), %xmm1
    471 	xor	%edi, %edi
    472 	lea	-32(%ecx), %ecx
    473 	movdqu	%xmm0, (%esi)
    474 	POP (%esi)
    475 L(shl_2_loop):
    476 
    477 	movdqa	16(%eax, %edi), %xmm2
    478 	sub	$32, %ecx
    479 	movdqa	32(%eax, %edi), %xmm3
    480 	movdqa	%xmm3, %xmm4
    481 	palignr	$2, %xmm2, %xmm3
    482 	palignr	$2, %xmm1, %xmm2
    483 	lea	32(%edi), %edi
    484 	movdqa	%xmm2, -32(%edx, %edi)
    485 	movdqa	%xmm3, -16(%edx, %edi)
    486 
    487 	jb	L(shl_2_end)
    488 
    489 	movdqa	16(%eax, %edi), %xmm2
    490 	sub	$32, %ecx
    491 	movdqa	32(%eax, %edi), %xmm3
    492 	movdqa	%xmm3, %xmm1
    493 	palignr	$2, %xmm2, %xmm3
    494 	palignr	$2, %xmm4, %xmm2
    495 	lea	32(%edi), %edi
    496 	movdqa	%xmm2, -32(%edx, %edi)
    497 	movdqa	%xmm3, -16(%edx, %edi)
    498 
    499 	jae	L(shl_2_loop)
    500 
    501 L(shl_2_end):
    502 	lea	32(%ecx), %ecx
    503 	add	%ecx, %edi
    504 	add	%edi, %edx
    505 	lea	2(%edi, %eax), %eax
    506 	POP (%edi)
    507 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    508 
    509 	ALIGN (4)
    510 L(shl_3):
    511 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    512 	lea	-3(%eax), %eax
    513 	movaps	(%eax), %xmm1
    514 	xor	%edi, %edi
    515 	lea	-32(%ecx), %ecx
    516 	movdqu	%xmm0, (%esi)
    517 	POP (%esi)
    518 L(shl_3_loop):
    519 
    520 	movdqa	16(%eax, %edi), %xmm2
    521 	sub	$32, %ecx
    522 	movdqa	32(%eax, %edi), %xmm3
    523 	movdqa	%xmm3, %xmm4
    524 	palignr	$3, %xmm2, %xmm3
    525 	palignr	$3, %xmm1, %xmm2
    526 	lea	32(%edi), %edi
    527 	movdqa	%xmm2, -32(%edx, %edi)
    528 	movdqa	%xmm3, -16(%edx, %edi)
    529 
    530 	jb	L(shl_3_end)
    531 
    532 	movdqa	16(%eax, %edi), %xmm2
    533 	sub	$32, %ecx
    534 	movdqa	32(%eax, %edi), %xmm3
    535 	movdqa	%xmm3, %xmm1
    536 	palignr	$3, %xmm2, %xmm3
    537 	palignr	$3, %xmm4, %xmm2
    538 	lea	32(%edi), %edi
    539 	movdqa	%xmm2, -32(%edx, %edi)
    540 	movdqa	%xmm3, -16(%edx, %edi)
    541 
    542 	jae	L(shl_3_loop)
    543 
    544 L(shl_3_end):
    545 	lea	32(%ecx), %ecx
    546 	add	%ecx, %edi
    547 	add	%edi, %edx
    548 	lea	3(%edi, %eax), %eax
    549 	POP (%edi)
    550 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    551 
    552 	ALIGN (4)
    553 L(shl_4):
    554 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    555 	lea	-4(%eax), %eax
    556 	movaps	(%eax), %xmm1
    557 	xor	%edi, %edi
    558 	lea	-32(%ecx), %ecx
    559 	movdqu	%xmm0, (%esi)
    560 	POP (%esi)
    561 L(shl_4_loop):
    562 
    563 	movdqa	16(%eax, %edi), %xmm2
    564 	sub	$32, %ecx
    565 	movdqa	32(%eax, %edi), %xmm3
    566 	movdqa	%xmm3, %xmm4
    567 	palignr	$4, %xmm2, %xmm3
    568 	palignr	$4, %xmm1, %xmm2
    569 	lea	32(%edi), %edi
    570 	movdqa	%xmm2, -32(%edx, %edi)
    571 	movdqa	%xmm3, -16(%edx, %edi)
    572 
    573 	jb	L(shl_4_end)
    574 
    575 	movdqa	16(%eax, %edi), %xmm2
    576 	sub	$32, %ecx
    577 	movdqa	32(%eax, %edi), %xmm3
    578 	movdqa	%xmm3, %xmm1
    579 	palignr	$4, %xmm2, %xmm3
    580 	palignr	$4, %xmm4, %xmm2
    581 	lea	32(%edi), %edi
    582 	movdqa	%xmm2, -32(%edx, %edi)
    583 	movdqa	%xmm3, -16(%edx, %edi)
    584 
    585 	jae	L(shl_4_loop)
    586 
    587 L(shl_4_end):
    588 	lea	32(%ecx), %ecx
    589 	add	%ecx, %edi
    590 	add	%edi, %edx
    591 	lea	4(%edi, %eax), %eax
    592 	POP (%edi)
    593 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    594 
    595 	ALIGN (4)
    596 L(shl_5):
    597 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    598 	lea	-5(%eax), %eax
    599 	movaps	(%eax), %xmm1
    600 	xor	%edi, %edi
    601 	lea	-32(%ecx), %ecx
    602 	movdqu	%xmm0, (%esi)
    603 	POP (%esi)
    604 L(shl_5_loop):
    605 
    606 	movdqa	16(%eax, %edi), %xmm2
    607 	sub	$32, %ecx
    608 	movdqa	32(%eax, %edi), %xmm3
    609 	movdqa	%xmm3, %xmm4
    610 	palignr	$5, %xmm2, %xmm3
    611 	palignr	$5, %xmm1, %xmm2
    612 	lea	32(%edi), %edi
    613 	movdqa	%xmm2, -32(%edx, %edi)
    614 	movdqa	%xmm3, -16(%edx, %edi)
    615 
    616 	jb	L(shl_5_end)
    617 
    618 	movdqa	16(%eax, %edi), %xmm2
    619 	sub	$32, %ecx
    620 	movdqa	32(%eax, %edi), %xmm3
    621 	movdqa	%xmm3, %xmm1
    622 	palignr	$5, %xmm2, %xmm3
    623 	palignr	$5, %xmm4, %xmm2
    624 	lea	32(%edi), %edi
    625 	movdqa	%xmm2, -32(%edx, %edi)
    626 	movdqa	%xmm3, -16(%edx, %edi)
    627 
    628 	jae	L(shl_5_loop)
    629 
    630 L(shl_5_end):
    631 	lea	32(%ecx), %ecx
    632 	add	%ecx, %edi
    633 	add	%edi, %edx
    634 	lea	5(%edi, %eax), %eax
    635 	POP (%edi)
    636 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    637 
    638 
    639 	ALIGN (4)
    640 L(shl_6):
    641 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    642 	lea	-6(%eax), %eax
    643 	movaps	(%eax), %xmm1
    644 	xor	%edi, %edi
    645 	lea	-32(%ecx), %ecx
    646 	movdqu	%xmm0, (%esi)
    647 	POP (%esi)
    648 L(shl_6_loop):
    649 
    650 	movdqa	16(%eax, %edi), %xmm2
    651 	sub	$32, %ecx
    652 	movdqa	32(%eax, %edi), %xmm3
    653 	movdqa	%xmm3, %xmm4
    654 	palignr	$6, %xmm2, %xmm3
    655 	palignr	$6, %xmm1, %xmm2
    656 	lea	32(%edi), %edi
    657 	movdqa	%xmm2, -32(%edx, %edi)
    658 	movdqa	%xmm3, -16(%edx, %edi)
    659 
    660 	jb	L(shl_6_end)
    661 
    662 	movdqa	16(%eax, %edi), %xmm2
    663 	sub	$32, %ecx
    664 	movdqa	32(%eax, %edi), %xmm3
    665 	movdqa	%xmm3, %xmm1
    666 	palignr	$6, %xmm2, %xmm3
    667 	palignr	$6, %xmm4, %xmm2
    668 	lea	32(%edi), %edi
    669 	movdqa	%xmm2, -32(%edx, %edi)
    670 	movdqa	%xmm3, -16(%edx, %edi)
    671 
    672 	jae	L(shl_6_loop)
    673 
    674 L(shl_6_end):
    675 	lea	32(%ecx), %ecx
    676 	add	%ecx, %edi
    677 	add	%edi, %edx
    678 	lea	6(%edi, %eax), %eax
    679 	POP (%edi)
    680 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    681 
    682 	ALIGN (4)
    683 L(shl_7):
    684 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    685 	lea	-7(%eax), %eax
    686 	movaps	(%eax), %xmm1
    687 	xor	%edi, %edi
    688 	lea	-32(%ecx), %ecx
    689 	movdqu	%xmm0, (%esi)
    690 	POP (%esi)
    691 L(shl_7_loop):
    692 
    693 	movdqa	16(%eax, %edi), %xmm2
    694 	sub	$32, %ecx
    695 	movdqa	32(%eax, %edi), %xmm3
    696 	movdqa	%xmm3, %xmm4
    697 	palignr	$7, %xmm2, %xmm3
    698 	palignr	$7, %xmm1, %xmm2
    699 	lea	32(%edi), %edi
    700 	movdqa	%xmm2, -32(%edx, %edi)
    701 	movdqa	%xmm3, -16(%edx, %edi)
    702 
    703 	jb	L(shl_7_end)
    704 
    705 	movdqa	16(%eax, %edi), %xmm2
    706 	sub	$32, %ecx
    707 	movdqa	32(%eax, %edi), %xmm3
    708 	movdqa	%xmm3, %xmm1
    709 	palignr	$7, %xmm2, %xmm3
    710 	palignr	$7, %xmm4, %xmm2
    711 	lea	32(%edi), %edi
    712 	movdqa	%xmm2, -32(%edx, %edi)
    713 	movdqa	%xmm3, -16(%edx, %edi)
    714 
    715 	jae	L(shl_7_loop)
    716 
    717 L(shl_7_end):
    718 	lea	32(%ecx), %ecx
    719 	add	%ecx, %edi
    720 	add	%edi, %edx
    721 	lea	7(%edi, %eax), %eax
    722 	POP (%edi)
    723 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    724 
    725 	ALIGN (4)
    726 L(shl_8):
    727 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    728 	lea	-8(%eax), %eax
    729 	movaps	(%eax), %xmm1
    730 	xor	%edi, %edi
    731 	lea	-32(%ecx), %ecx
    732 	movdqu	%xmm0, (%esi)
    733 	POP (%esi)
    734 L(shl_8_loop):
    735 
    736 	movdqa	16(%eax, %edi), %xmm2
    737 	sub	$32, %ecx
    738 	movdqa	32(%eax, %edi), %xmm3
    739 	movdqa	%xmm3, %xmm4
    740 	palignr	$8, %xmm2, %xmm3
    741 	palignr	$8, %xmm1, %xmm2
    742 	lea	32(%edi), %edi
    743 	movdqa	%xmm2, -32(%edx, %edi)
    744 	movdqa	%xmm3, -16(%edx, %edi)
    745 
    746 	jb	L(shl_8_end)
    747 
    748 	movdqa	16(%eax, %edi), %xmm2
    749 	sub	$32, %ecx
    750 	movdqa	32(%eax, %edi), %xmm3
    751 	movdqa	%xmm3, %xmm1
    752 	palignr	$8, %xmm2, %xmm3
    753 	palignr	$8, %xmm4, %xmm2
    754 	lea	32(%edi), %edi
    755 	movdqa	%xmm2, -32(%edx, %edi)
    756 	movdqa	%xmm3, -16(%edx, %edi)
    757 
    758 	jae	L(shl_8_loop)
    759 
    760 L(shl_8_end):
    761 	lea	32(%ecx), %ecx
    762 	add	%ecx, %edi
    763 	add	%edi, %edx
    764 	lea	8(%edi, %eax), %eax
    765 	POP (%edi)
    766 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    767 
    768 	ALIGN (4)
    769 L(shl_9):
    770 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    771 	lea	-9(%eax), %eax
    772 	movaps	(%eax), %xmm1
    773 	xor	%edi, %edi
    774 	lea	-32(%ecx), %ecx
    775 	movdqu	%xmm0, (%esi)
    776 	POP (%esi)
    777 L(shl_9_loop):
    778 
    779 	movdqa	16(%eax, %edi), %xmm2
    780 	sub	$32, %ecx
    781 	movdqa	32(%eax, %edi), %xmm3
    782 	movdqa	%xmm3, %xmm4
    783 	palignr	$9, %xmm2, %xmm3
    784 	palignr	$9, %xmm1, %xmm2
    785 	lea	32(%edi), %edi
    786 	movdqa	%xmm2, -32(%edx, %edi)
    787 	movdqa	%xmm3, -16(%edx, %edi)
    788 
    789 	jb	L(shl_9_end)
    790 
    791 	movdqa	16(%eax, %edi), %xmm2
    792 	sub	$32, %ecx
    793 	movdqa	32(%eax, %edi), %xmm3
    794 	movdqa	%xmm3, %xmm1
    795 	palignr	$9, %xmm2, %xmm3
    796 	palignr	$9, %xmm4, %xmm2
    797 	lea	32(%edi), %edi
    798 	movdqa	%xmm2, -32(%edx, %edi)
    799 	movdqa	%xmm3, -16(%edx, %edi)
    800 
    801 	jae	L(shl_9_loop)
    802 
    803 L(shl_9_end):
    804 	lea	32(%ecx), %ecx
    805 	add	%ecx, %edi
    806 	add	%edi, %edx
    807 	lea	9(%edi, %eax), %eax
    808 	POP (%edi)
    809 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    810 
    811 	ALIGN (4)
    812 L(shl_10):
    813 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    814 	lea	-10(%eax), %eax
    815 	movaps	(%eax), %xmm1
    816 	xor	%edi, %edi
    817 	lea	-32(%ecx), %ecx
    818 	movdqu	%xmm0, (%esi)
    819 	POP (%esi)
    820 L(shl_10_loop):
    821 
    822 	movdqa	16(%eax, %edi), %xmm2
    823 	sub	$32, %ecx
    824 	movdqa	32(%eax, %edi), %xmm3
    825 	movdqa	%xmm3, %xmm4
    826 	palignr	$10, %xmm2, %xmm3
    827 	palignr	$10, %xmm1, %xmm2
    828 	lea	32(%edi), %edi
    829 	movdqa	%xmm2, -32(%edx, %edi)
    830 	movdqa	%xmm3, -16(%edx, %edi)
    831 
    832 	jb	L(shl_10_end)
    833 
    834 	movdqa	16(%eax, %edi), %xmm2
    835 	sub	$32, %ecx
    836 	movdqa	32(%eax, %edi), %xmm3
    837 	movdqa	%xmm3, %xmm1
    838 	palignr	$10, %xmm2, %xmm3
    839 	palignr	$10, %xmm4, %xmm2
    840 	lea	32(%edi), %edi
    841 	movdqa	%xmm2, -32(%edx, %edi)
    842 	movdqa	%xmm3, -16(%edx, %edi)
    843 
    844 	jae	L(shl_10_loop)
    845 
    846 L(shl_10_end):
    847 	lea	32(%ecx), %ecx
    848 	add	%ecx, %edi
    849 	add	%edi, %edx
    850 	lea	10(%edi, %eax), %eax
    851 	POP (%edi)
    852 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    853 
    854 	ALIGN (4)
    855 L(shl_11):
    856 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    857 	lea	-11(%eax), %eax
    858 	movaps	(%eax), %xmm1
    859 	xor	%edi, %edi
    860 	lea	-32(%ecx), %ecx
    861 	movdqu	%xmm0, (%esi)
    862 	POP (%esi)
    863 L(shl_11_loop):
    864 
    865 	movdqa	16(%eax, %edi), %xmm2
    866 	sub	$32, %ecx
    867 	movdqa	32(%eax, %edi), %xmm3
    868 	movdqa	%xmm3, %xmm4
    869 	palignr	$11, %xmm2, %xmm3
    870 	palignr	$11, %xmm1, %xmm2
    871 	lea	32(%edi), %edi
    872 	movdqa	%xmm2, -32(%edx, %edi)
    873 	movdqa	%xmm3, -16(%edx, %edi)
    874 
    875 	jb	L(shl_11_end)
    876 
    877 	movdqa	16(%eax, %edi), %xmm2
    878 	sub	$32, %ecx
    879 	movdqa	32(%eax, %edi), %xmm3
    880 	movdqa	%xmm3, %xmm1
    881 	palignr	$11, %xmm2, %xmm3
    882 	palignr	$11, %xmm4, %xmm2
    883 	lea	32(%edi), %edi
    884 	movdqa	%xmm2, -32(%edx, %edi)
    885 	movdqa	%xmm3, -16(%edx, %edi)
    886 
    887 	jae	L(shl_11_loop)
    888 
    889 L(shl_11_end):
    890 	lea	32(%ecx), %ecx
    891 	add	%ecx, %edi
    892 	add	%edi, %edx
    893 	lea	11(%edi, %eax), %eax
    894 	POP (%edi)
    895 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    896 
    897 	ALIGN (4)
    898 L(shl_12):
    899 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    900 	lea	-12(%eax), %eax
    901 	movaps	(%eax), %xmm1
    902 	xor	%edi, %edi
    903 	lea	-32(%ecx), %ecx
    904 	movdqu	%xmm0, (%esi)
    905 	POP (%esi)
    906 L(shl_12_loop):
    907 
    908 	movdqa	16(%eax, %edi), %xmm2
    909 	sub	$32, %ecx
    910 	movdqa	32(%eax, %edi), %xmm3
    911 	movdqa	%xmm3, %xmm4
    912 	palignr	$12, %xmm2, %xmm3
    913 	palignr	$12, %xmm1, %xmm2
    914 	lea	32(%edi), %edi
    915 	movdqa	%xmm2, -32(%edx, %edi)
    916 	movdqa	%xmm3, -16(%edx, %edi)
    917 
    918 	jb	L(shl_12_end)
    919 
    920 	movdqa	16(%eax, %edi), %xmm2
    921 	sub	$32, %ecx
    922 	movdqa	32(%eax, %edi), %xmm3
    923 	movdqa	%xmm3, %xmm1
    924 	palignr	$12, %xmm2, %xmm3
    925 	palignr	$12, %xmm4, %xmm2
    926 	lea	32(%edi), %edi
    927 	movdqa	%xmm2, -32(%edx, %edi)
    928 	movdqa	%xmm3, -16(%edx, %edi)
    929 
    930 	jae	L(shl_12_loop)
    931 
    932 L(shl_12_end):
    933 	lea	32(%ecx), %ecx
    934 	add	%ecx, %edi
    935 	add	%edi, %edx
    936 	lea	12(%edi, %eax), %eax
    937 	POP (%edi)
    938 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    939 
    940 	ALIGN (4)
    941 L(shl_13):
    942 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    943 	lea	-13(%eax), %eax
    944 	movaps	(%eax), %xmm1
    945 	xor	%edi, %edi
    946 	lea	-32(%ecx), %ecx
    947 	movdqu	%xmm0, (%esi)
    948 	POP (%esi)
    949 L(shl_13_loop):
    950 
    951 	movdqa	16(%eax, %edi), %xmm2
    952 	sub	$32, %ecx
    953 	movdqa	32(%eax, %edi), %xmm3
    954 	movdqa	%xmm3, %xmm4
    955 	palignr	$13, %xmm2, %xmm3
    956 	palignr	$13, %xmm1, %xmm2
    957 	lea	32(%edi), %edi
    958 	movdqa	%xmm2, -32(%edx, %edi)
    959 	movdqa	%xmm3, -16(%edx, %edi)
    960 
    961 	jb	L(shl_13_end)
    962 
    963 	movdqa	16(%eax, %edi), %xmm2
    964 	sub	$32, %ecx
    965 	movdqa	32(%eax, %edi), %xmm3
    966 	movdqa	%xmm3, %xmm1
    967 	palignr	$13, %xmm2, %xmm3
    968 	palignr	$13, %xmm4, %xmm2
    969 	lea	32(%edi), %edi
    970 	movdqa	%xmm2, -32(%edx, %edi)
    971 	movdqa	%xmm3, -16(%edx, %edi)
    972 
    973 	jae	L(shl_13_loop)
    974 
    975 L(shl_13_end):
    976 	lea	32(%ecx), %ecx
    977 	add	%ecx, %edi
    978 	add	%edi, %edx
    979 	lea	13(%edi, %eax), %eax
    980 	POP (%edi)
    981 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    982 
    983 	ALIGN (4)
    984 L(shl_14):
    985 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    986 	lea	-14(%eax), %eax
    987 	movaps	(%eax), %xmm1
    988 	xor	%edi, %edi
    989 	lea	-32(%ecx), %ecx
    990 	movdqu	%xmm0, (%esi)
    991 	POP (%esi)
    992 L(shl_14_loop):
    993 
    994 	movdqa	16(%eax, %edi), %xmm2
    995 	sub	$32, %ecx
    996 	movdqa	32(%eax, %edi), %xmm3
    997 	movdqa	%xmm3, %xmm4
    998 	palignr	$14, %xmm2, %xmm3
    999 	palignr	$14, %xmm1, %xmm2
   1000 	lea	32(%edi), %edi
   1001 	movdqa	%xmm2, -32(%edx, %edi)
   1002 	movdqa	%xmm3, -16(%edx, %edi)
   1003 
   1004 	jb	L(shl_14_end)
   1005 
   1006 	movdqa	16(%eax, %edi), %xmm2
   1007 	sub	$32, %ecx
   1008 	movdqa	32(%eax, %edi), %xmm3
   1009 	movdqa	%xmm3, %xmm1
   1010 	palignr	$14, %xmm2, %xmm3
   1011 	palignr	$14, %xmm4, %xmm2
   1012 	lea	32(%edi), %edi
   1013 	movdqa	%xmm2, -32(%edx, %edi)
   1014 	movdqa	%xmm3, -16(%edx, %edi)
   1015 
   1016 	jae	L(shl_14_loop)
   1017 
   1018 L(shl_14_end):
   1019 	lea	32(%ecx), %ecx
   1020 	add	%ecx, %edi
   1021 	add	%edi, %edx
   1022 	lea	14(%edi, %eax), %eax
   1023 	POP (%edi)
   1024 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
   1025 
   1026 
   1027 	ALIGN (4)
   1028 L(shl_15):
   1029 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
   1030 	lea	-15(%eax), %eax
   1031 	movaps	(%eax), %xmm1
   1032 	xor	%edi, %edi
   1033 	lea	-32(%ecx), %ecx
   1034 	movdqu	%xmm0, (%esi)
   1035 	POP (%esi)
   1036 L(shl_15_loop):
   1037 
   1038 	movdqa	16(%eax, %edi), %xmm2
   1039 	sub	$32, %ecx
   1040 	movdqa	32(%eax, %edi), %xmm3
   1041 	movdqa	%xmm3, %xmm4
   1042 	palignr	$15, %xmm2, %xmm3
   1043 	palignr	$15, %xmm1, %xmm2
   1044 	lea	32(%edi), %edi
   1045 	movdqa	%xmm2, -32(%edx, %edi)
   1046 	movdqa	%xmm3, -16(%edx, %edi)
   1047 
   1048 	jb	L(shl_15_end)
   1049 
   1050 	movdqa	16(%eax, %edi), %xmm2
   1051 	sub	$32, %ecx
   1052 	movdqa	32(%eax, %edi), %xmm3
   1053 	movdqa	%xmm3, %xmm1
   1054 	palignr	$15, %xmm2, %xmm3
   1055 	palignr	$15, %xmm4, %xmm2
   1056 	lea	32(%edi), %edi
   1057 	movdqa	%xmm2, -32(%edx, %edi)
   1058 	movdqa	%xmm3, -16(%edx, %edi)
   1059 
   1060 	jae	L(shl_15_loop)
   1061 
   1062 L(shl_15_end):
   1063 	lea	32(%ecx), %ecx
   1064 	add	%ecx, %edi
   1065 	add	%edi, %edx
   1066 	lea	15(%edi, %eax), %eax
   1067 	POP (%edi)
   1068 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
   1069 
   1070 
   1071 	ALIGN (4)
   1072 L(fwd_write_44bytes):
   1073 	movl	-44(%eax), %ecx
   1074 	movl	%ecx, -44(%edx)
   1075 L(fwd_write_40bytes):
   1076 	movl	-40(%eax), %ecx
   1077 	movl	%ecx, -40(%edx)
   1078 L(fwd_write_36bytes):
   1079 	movl	-36(%eax), %ecx
   1080 	movl	%ecx, -36(%edx)
   1081 L(fwd_write_32bytes):
   1082 	movl	-32(%eax), %ecx
   1083 	movl	%ecx, -32(%edx)
   1084 L(fwd_write_28bytes):
   1085 	movl	-28(%eax), %ecx
   1086 	movl	%ecx, -28(%edx)
   1087 L(fwd_write_24bytes):
   1088 	movl	-24(%eax), %ecx
   1089 	movl	%ecx, -24(%edx)
   1090 L(fwd_write_20bytes):
   1091 	movl	-20(%eax), %ecx
   1092 	movl	%ecx, -20(%edx)
   1093 L(fwd_write_16bytes):
   1094 	movl	-16(%eax), %ecx
   1095 	movl	%ecx, -16(%edx)
   1096 L(fwd_write_12bytes):
   1097 	movl	-12(%eax), %ecx
   1098 	movl	%ecx, -12(%edx)
   1099 L(fwd_write_8bytes):
   1100 	movl	-8(%eax), %ecx
   1101 	movl	%ecx, -8(%edx)
   1102 L(fwd_write_4bytes):
   1103 	movl	-4(%eax), %ecx
   1104 	movl	%ecx, -4(%edx)
   1105 L(fwd_write_0bytes):
   1106 #ifndef USE_AS_BCOPY
   1107 # ifdef USE_AS_MEMPCPY
   1108 	movl	%edx, %eax
   1109 # else
   1110 	movl	DEST(%esp), %eax
   1111 # endif
   1112 #endif
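/* Return value note (applies to the other small-copy exits as well):
   memcpy/memmove reload the original destination from DEST(%esp);
   mempcpy returns dst + len, which %edx already holds on these forward
   paths; bcopy returns nothing, so USE_AS_BCOPY skips this block.  */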
   1113 	RETURN
   1114 
   1115 	ALIGN (4)
   1116 L(fwd_write_5bytes):
   1117 	movl	-5(%eax), %ecx
   1118 	movl	-4(%eax), %eax
   1119 	movl	%ecx, -5(%edx)
   1120 	movl	%eax, -4(%edx)
   1121 #ifndef USE_AS_BCOPY
   1122 # ifdef USE_AS_MEMPCPY
   1123 	movl	%edx, %eax
   1124 # else
   1125 	movl	DEST(%esp), %eax
   1126 # endif
   1127 #endif
   1128 	RETURN
   1129 
   1130 	ALIGN (4)
   1131 L(fwd_write_45bytes):
   1132 	movl	-45(%eax), %ecx
   1133 	movl	%ecx, -45(%edx)
   1134 L(fwd_write_41bytes):
   1135 	movl	-41(%eax), %ecx
   1136 	movl	%ecx, -41(%edx)
   1137 L(fwd_write_37bytes):
   1138 	movl	-37(%eax), %ecx
   1139 	movl	%ecx, -37(%edx)
   1140 L(fwd_write_33bytes):
   1141 	movl	-33(%eax), %ecx
   1142 	movl	%ecx, -33(%edx)
   1143 L(fwd_write_29bytes):
   1144 	movl	-29(%eax), %ecx
   1145 	movl	%ecx, -29(%edx)
   1146 L(fwd_write_25bytes):
   1147 	movl	-25(%eax), %ecx
   1148 	movl	%ecx, -25(%edx)
   1149 L(fwd_write_21bytes):
   1150 	movl	-21(%eax), %ecx
   1151 	movl	%ecx, -21(%edx)
   1152 L(fwd_write_17bytes):
   1153 	movl	-17(%eax), %ecx
   1154 	movl	%ecx, -17(%edx)
   1155 L(fwd_write_13bytes):
   1156 	movl	-13(%eax), %ecx
   1157 	movl	%ecx, -13(%edx)
   1158 L(fwd_write_9bytes):
   1159 	movl	-9(%eax), %ecx
   1160 	movl	%ecx, -9(%edx)
   1161 	movl	-5(%eax), %ecx
   1162 	movl	%ecx, -5(%edx)
   1163 L(fwd_write_1bytes):
   1164 	movzbl	-1(%eax), %ecx
   1165 	movb	%cl, -1(%edx)
   1166 #ifndef USE_AS_BCOPY
   1167 # ifdef USE_AS_MEMPCPY
   1168 	movl	%edx, %eax
   1169 # else
   1170 	movl	DEST(%esp), %eax
   1171 # endif
   1172 #endif
   1173 	RETURN
   1174 
   1175 	ALIGN (4)
   1176 L(fwd_write_46bytes):
   1177 	movl	-46(%eax), %ecx
   1178 	movl	%ecx, -46(%edx)
   1179 L(fwd_write_42bytes):
   1180 	movl	-42(%eax), %ecx
   1181 	movl	%ecx, -42(%edx)
   1182 L(fwd_write_38bytes):
   1183 	movl	-38(%eax), %ecx
   1184 	movl	%ecx, -38(%edx)
   1185 L(fwd_write_34bytes):
   1186 	movl	-34(%eax), %ecx
   1187 	movl	%ecx, -34(%edx)
   1188 L(fwd_write_30bytes):
   1189 	movl	-30(%eax), %ecx
   1190 	movl	%ecx, -30(%edx)
   1191 L(fwd_write_26bytes):
   1192 	movl	-26(%eax), %ecx
   1193 	movl	%ecx, -26(%edx)
   1194 L(fwd_write_22bytes):
   1195 	movl	-22(%eax), %ecx
   1196 	movl	%ecx, -22(%edx)
   1197 L(fwd_write_18bytes):
   1198 	movl	-18(%eax), %ecx
   1199 	movl	%ecx, -18(%edx)
   1200 L(fwd_write_14bytes):
   1201 	movl	-14(%eax), %ecx
   1202 	movl	%ecx, -14(%edx)
   1203 L(fwd_write_10bytes):
   1204 	movl	-10(%eax), %ecx
   1205 	movl	%ecx, -10(%edx)
   1206 L(fwd_write_6bytes):
   1207 	movl	-6(%eax), %ecx
   1208 	movl	%ecx, -6(%edx)
   1209 L(fwd_write_2bytes):
   1210 	movzwl	-2(%eax), %ecx
   1211 	movw	%cx, -2(%edx)
   1212 #ifndef USE_AS_BCOPY
   1213 # ifdef USE_AS_MEMPCPY
   1214 	movl	%edx, %eax
   1215 # else
   1216 	movl	DEST(%esp), %eax
   1217 # endif
   1218 #endif
   1219 	RETURN
   1220 
   1221 	ALIGN (4)
   1222 L(fwd_write_47bytes):
   1223 	movl	-47(%eax), %ecx
   1224 	movl	%ecx, -47(%edx)
   1225 L(fwd_write_43bytes):
   1226 	movl	-43(%eax), %ecx
   1227 	movl	%ecx, -43(%edx)
   1228 L(fwd_write_39bytes):
   1229 	movl	-39(%eax), %ecx
   1230 	movl	%ecx, -39(%edx)
   1231 L(fwd_write_35bytes):
   1232 	movl	-35(%eax), %ecx
   1233 	movl	%ecx, -35(%edx)
   1234 L(fwd_write_31bytes):
   1235 	movl	-31(%eax), %ecx
   1236 	movl	%ecx, -31(%edx)
   1237 L(fwd_write_27bytes):
   1238 	movl	-27(%eax), %ecx
   1239 	movl	%ecx, -27(%edx)
   1240 L(fwd_write_23bytes):
   1241 	movl	-23(%eax), %ecx
   1242 	movl	%ecx, -23(%edx)
   1243 L(fwd_write_19bytes):
   1244 	movl	-19(%eax), %ecx
   1245 	movl	%ecx, -19(%edx)
   1246 L(fwd_write_15bytes):
   1247 	movl	-15(%eax), %ecx
   1248 	movl	%ecx, -15(%edx)
   1249 L(fwd_write_11bytes):
   1250 	movl	-11(%eax), %ecx
   1251 	movl	%ecx, -11(%edx)
   1252 L(fwd_write_7bytes):
   1253 	movl	-7(%eax), %ecx
   1254 	movl	%ecx, -7(%edx)
   1255 L(fwd_write_3bytes):
   1256 	movzwl	-3(%eax), %ecx
   1257 	movzbl	-1(%eax), %eax
   1258 	movw	%cx, -3(%edx)
   1259 	movb	%al, -1(%edx)
   1260 #ifndef USE_AS_BCOPY
   1261 # ifdef USE_AS_MEMPCPY
   1262 	movl	%edx, %eax
   1263 # else
   1264 	movl	DEST(%esp), %eax
   1265 # endif
   1266 #endif
   1267 	RETURN
   1268 
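/* Descriptive note: L(large_page) streams the copy with movntdq
   (non-temporal) stores, which bypass the caches so a huge copy does not
   evict useful data; the sfence in L(large_page_less_32bytes) orders those
   stores before the remaining tail is copied and the routine returns.  */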
   1269 	ALIGN (4)
   1270 L(large_page):
   1271 	movdqu	(%eax), %xmm1
   1272 	lea	16(%eax), %eax
   1273 	movdqu	%xmm0, (%esi)
   1274 	movntdq	%xmm1, (%edx)
   1275 	lea	16(%edx), %edx
   1276 	POP (%esi)
   1277 	lea	-0x90(%ecx), %ecx
   1278 	POP (%edi)
   1279 L(large_page_loop):
   1280 	movdqu	(%eax), %xmm0
   1281 	movdqu	0x10(%eax), %xmm1
   1282 	movdqu	0x20(%eax), %xmm2
   1283 	movdqu	0x30(%eax), %xmm3
   1284 	movdqu	0x40(%eax), %xmm4
   1285 	movdqu	0x50(%eax), %xmm5
   1286 	movdqu	0x60(%eax), %xmm6
   1287 	movdqu	0x70(%eax), %xmm7
   1288 	lea	0x80(%eax), %eax
   1289 
   1290 	sub	$0x80, %ecx
   1291 	movntdq	%xmm0, (%edx)
   1292 	movntdq	%xmm1, 0x10(%edx)
   1293 	movntdq	%xmm2, 0x20(%edx)
   1294 	movntdq	%xmm3, 0x30(%edx)
   1295 	movntdq	%xmm4, 0x40(%edx)
   1296 	movntdq	%xmm5, 0x50(%edx)
   1297 	movntdq	%xmm6, 0x60(%edx)
   1298 	movntdq	%xmm7, 0x70(%edx)
   1299 	lea	0x80(%edx), %edx
   1300 	jae	L(large_page_loop)
   1301 	cmp	$-0x40, %ecx
   1302 	lea	0x80(%ecx), %ecx
   1303 	jl	L(large_page_less_64bytes)
   1304 
   1305 	movdqu	(%eax), %xmm0
   1306 	movdqu	0x10(%eax), %xmm1
   1307 	movdqu	0x20(%eax), %xmm2
   1308 	movdqu	0x30(%eax), %xmm3
   1309 	lea	0x40(%eax), %eax
   1310 
   1311 	movntdq	%xmm0, (%edx)
   1312 	movntdq	%xmm1, 0x10(%edx)
   1313 	movntdq	%xmm2, 0x20(%edx)
   1314 	movntdq	%xmm3, 0x30(%edx)
   1315 	lea	0x40(%edx), %edx
   1316 	sub	$0x40, %ecx
   1317 L(large_page_less_64bytes):
   1318 	cmp	$32, %ecx
   1319 	jb	L(large_page_less_32bytes)
   1320 	movdqu	(%eax), %xmm0
   1321 	movdqu	0x10(%eax), %xmm1
   1322 	lea	0x20(%eax), %eax
   1323 	movntdq	%xmm0, (%edx)
   1324 	movntdq	%xmm1, 0x10(%edx)
   1325 	lea	0x20(%edx), %edx
   1326 	sub	$0x20, %ecx
   1327 L(large_page_less_32bytes):
   1328 	add	%ecx, %edx
   1329 	add	%ecx, %eax
   1330 	sfence
   1331 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
   1332 
   1333 
   1334 	ALIGN (4)
   1335 L(bk_write_44bytes):
   1336 	movl	40(%eax), %ecx
   1337 	movl	%ecx, 40(%edx)
   1338 L(bk_write_40bytes):
   1339 	movl	36(%eax), %ecx
   1340 	movl	%ecx, 36(%edx)
   1341 L(bk_write_36bytes):
   1342 	movl	32(%eax), %ecx
   1343 	movl	%ecx, 32(%edx)
   1344 L(bk_write_32bytes):
   1345 	movl	28(%eax), %ecx
   1346 	movl	%ecx, 28(%edx)
   1347 L(bk_write_28bytes):
   1348 	movl	24(%eax), %ecx
   1349 	movl	%ecx, 24(%edx)
   1350 L(bk_write_24bytes):
   1351 	movl	20(%eax), %ecx
   1352 	movl	%ecx, 20(%edx)
   1353 L(bk_write_20bytes):
   1354 	movl	16(%eax), %ecx
   1355 	movl	%ecx, 16(%edx)
   1356 L(bk_write_16bytes):
   1357 	movl	12(%eax), %ecx
   1358 	movl	%ecx, 12(%edx)
   1359 L(bk_write_12bytes):
   1360 	movl	8(%eax), %ecx
   1361 	movl	%ecx, 8(%edx)
   1362 L(bk_write_8bytes):
   1363 	movl	4(%eax), %ecx
   1364 	movl	%ecx, 4(%edx)
   1365 L(bk_write_4bytes):
   1366 	movl	(%eax), %ecx
   1367 	movl	%ecx, (%edx)
   1368 L(bk_write_0bytes):
   1369 #ifndef USE_AS_BCOPY
   1370 	movl	DEST(%esp), %eax
   1371 # ifdef USE_AS_MEMPCPY
   1372 	movl	LEN(%esp), %ecx
   1373 	add	%ecx, %eax
   1374 # endif
   1375 #endif
   1376 	RETURN
   1377 
   1378 	ALIGN (4)
   1379 L(bk_write_45bytes):
   1380 	movl	41(%eax), %ecx
   1381 	movl	%ecx, 41(%edx)
   1382 L(bk_write_41bytes):
   1383 	movl	37(%eax), %ecx
   1384 	movl	%ecx, 37(%edx)
   1385 L(bk_write_37bytes):
   1386 	movl	33(%eax), %ecx
   1387 	movl	%ecx, 33(%edx)
   1388 L(bk_write_33bytes):
   1389 	movl	29(%eax), %ecx
   1390 	movl	%ecx, 29(%edx)
   1391 L(bk_write_29bytes):
   1392 	movl	25(%eax), %ecx
   1393 	movl	%ecx, 25(%edx)
   1394 L(bk_write_25bytes):
   1395 	movl	21(%eax), %ecx
   1396 	movl	%ecx, 21(%edx)
   1397 L(bk_write_21bytes):
   1398 	movl	17(%eax), %ecx
   1399 	movl	%ecx, 17(%edx)
   1400 L(bk_write_17bytes):
   1401 	movl	13(%eax), %ecx
   1402 	movl	%ecx, 13(%edx)
   1403 L(bk_write_13bytes):
   1404 	movl	9(%eax), %ecx
   1405 	movl	%ecx, 9(%edx)
   1406 L(bk_write_9bytes):
   1407 	movl	5(%eax), %ecx
   1408 	movl	%ecx, 5(%edx)
   1409 L(bk_write_5bytes):
   1410 	movl	1(%eax), %ecx
   1411 	movl	%ecx, 1(%edx)
   1412 L(bk_write_1bytes):
   1413 	movzbl	(%eax), %ecx
   1414 	movb	%cl, (%edx)
   1415 #ifndef USE_AS_BCOPY
   1416 	movl	DEST(%esp), %eax
   1417 # ifdef USE_AS_MEMPCPY
   1418 	movl	LEN(%esp), %ecx
   1419 	add	%ecx, %eax
   1420 # endif
   1421 #endif
   1422 	RETURN
   1423 
   1424 	ALIGN (4)
   1425 L(bk_write_46bytes):
   1426 	movl	42(%eax), %ecx
   1427 	movl	%ecx, 42(%edx)
   1428 L(bk_write_42bytes):
   1429 	movl	38(%eax), %ecx
   1430 	movl	%ecx, 38(%edx)
   1431 L(bk_write_38bytes):
   1432 	movl	34(%eax), %ecx
   1433 	movl	%ecx, 34(%edx)
   1434 L(bk_write_34bytes):
   1435 	movl	30(%eax), %ecx
   1436 	movl	%ecx, 30(%edx)
   1437 L(bk_write_30bytes):
   1438 	movl	26(%eax), %ecx
   1439 	movl	%ecx, 26(%edx)
   1440 L(bk_write_26bytes):
   1441 	movl	22(%eax), %ecx
   1442 	movl	%ecx, 22(%edx)
   1443 L(bk_write_22bytes):
   1444 	movl	18(%eax), %ecx
   1445 	movl	%ecx, 18(%edx)
   1446 L(bk_write_18bytes):
   1447 	movl	14(%eax), %ecx
   1448 	movl	%ecx, 14(%edx)
   1449 L(bk_write_14bytes):
   1450 	movl	10(%eax), %ecx
   1451 	movl	%ecx, 10(%edx)
   1452 L(bk_write_10bytes):
   1453 	movl	6(%eax), %ecx
   1454 	movl	%ecx, 6(%edx)
   1455 L(bk_write_6bytes):
   1456 	movl	2(%eax), %ecx
   1457 	movl	%ecx, 2(%edx)
   1458 L(bk_write_2bytes):
   1459 	movzwl	(%eax), %ecx
   1460 	movw	%cx, (%edx)
   1461 #ifndef USE_AS_BCOPY
   1462 	movl	DEST(%esp), %eax
   1463 # ifdef USE_AS_MEMPCPY
   1464 	movl	LEN(%esp), %ecx
   1465 	add	%ecx, %eax
   1466 # endif
   1467 #endif
   1468 	RETURN
   1469 
   1470 	ALIGN (4)
   1471 L(bk_write_47bytes):
   1472 	movl	43(%eax), %ecx
   1473 	movl	%ecx, 43(%edx)
   1474 L(bk_write_43bytes):
   1475 	movl	39(%eax), %ecx
   1476 	movl	%ecx, 39(%edx)
   1477 L(bk_write_39bytes):
   1478 	movl	35(%eax), %ecx
   1479 	movl	%ecx, 35(%edx)
   1480 L(bk_write_35bytes):
   1481 	movl	31(%eax), %ecx
   1482 	movl	%ecx, 31(%edx)
   1483 L(bk_write_31bytes):
   1484 	movl	27(%eax), %ecx
   1485 	movl	%ecx, 27(%edx)
   1486 L(bk_write_27bytes):
   1487 	movl	23(%eax), %ecx
   1488 	movl	%ecx, 23(%edx)
   1489 L(bk_write_23bytes):
   1490 	movl	19(%eax), %ecx
   1491 	movl	%ecx, 19(%edx)
   1492 L(bk_write_19bytes):
   1493 	movl	15(%eax), %ecx
   1494 	movl	%ecx, 15(%edx)
   1495 L(bk_write_15bytes):
   1496 	movl	11(%eax), %ecx
   1497 	movl	%ecx, 11(%edx)
   1498 L(bk_write_11bytes):
   1499 	movl	7(%eax), %ecx
   1500 	movl	%ecx, 7(%edx)
   1501 L(bk_write_7bytes):
   1502 	movl	3(%eax), %ecx
   1503 	movl	%ecx, 3(%edx)
   1504 L(bk_write_3bytes):
   1505 	movzwl	1(%eax), %ecx
   1506 	movw	%cx, 1(%edx)
   1507 	movzbl	(%eax), %eax
   1508 	movb	%al, (%edx)
   1509 #ifndef USE_AS_BCOPY
   1510 	movl	DEST(%esp), %eax
   1511 # ifdef USE_AS_MEMPCPY
   1512 	movl	LEN(%esp), %ecx
   1513 	add	%ecx, %eax
   1514 # endif
   1515 #endif
   1516 	RETURN_END
   1517 
   1518 
   1519 	.pushsection .rodata.ssse3,"a",@progbits
   1520 	ALIGN (2)
   1521 L(table_48bytes_fwd):
   1522 	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
   1523 	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
   1524 	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
   1525 	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
   1526 	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
   1527 	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
   1528 	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
   1529 	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
   1530 	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
   1531 	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
   1532 	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
   1533 	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
   1534 	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
   1535 	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
   1536 	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
   1537 	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
   1538 	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
   1539 	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
   1540 	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
   1541 	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
   1542 	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
   1543 	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
   1544 	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
   1545 	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
   1546 	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
   1547 	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
   1548 	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
   1549 	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
   1550 	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
   1551 	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
   1552 	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
   1553 	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
   1554 	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
   1555 	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
   1556 	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
   1557 	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
   1558 	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
   1559 	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
   1560 	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
   1561 	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
   1562 	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
   1563 	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
   1564 	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
   1565 	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
   1566 	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
   1567 	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
   1568 	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
   1569 	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
   1570 
   1571 	ALIGN (2)
   1572 L(shl_table):
   1573 	.int	JMPTBL (L(shl_0), L(shl_table))
   1574 	.int	JMPTBL (L(shl_1), L(shl_table))
   1575 	.int	JMPTBL (L(shl_2), L(shl_table))
   1576 	.int	JMPTBL (L(shl_3), L(shl_table))
   1577 	.int	JMPTBL (L(shl_4), L(shl_table))
   1578 	.int	JMPTBL (L(shl_5), L(shl_table))
   1579 	.int	JMPTBL (L(shl_6), L(shl_table))
   1580 	.int	JMPTBL (L(shl_7), L(shl_table))
   1581 	.int	JMPTBL (L(shl_8), L(shl_table))
   1582 	.int	JMPTBL (L(shl_9), L(shl_table))
   1583 	.int	JMPTBL (L(shl_10), L(shl_table))
   1584 	.int	JMPTBL (L(shl_11), L(shl_table))
   1585 	.int	JMPTBL (L(shl_12), L(shl_table))
   1586 	.int	JMPTBL (L(shl_13), L(shl_table))
   1587 	.int	JMPTBL (L(shl_14), L(shl_table))
   1588 	.int	JMPTBL (L(shl_15), L(shl_table))
   1589 
   1590 	ALIGN (2)
   1591 L(table_48_bytes_bwd):
   1592 	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
   1593 	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
   1594 	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
   1595 	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
   1596 	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
   1597 	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
   1598 	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
   1599 	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
   1600 	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
   1601 	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
   1602 	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
   1603 	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
   1604 	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
   1605 	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
   1606 	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
   1607 	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
   1608 	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
   1609 	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
   1610 	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
   1611 	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
   1612 	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
   1613 	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
   1614 	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
   1615 	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
   1616 	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
   1617 	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
   1618 	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
   1619 	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
   1620 	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
   1621 	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
   1622 	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
   1623 	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
   1624 	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
   1625 	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
   1626 	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
   1627 	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
   1628 	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
   1629 	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
   1630 	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
   1631 	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
   1632 	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
   1633 	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
   1634 	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
   1635 	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
   1636 	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
   1637 	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
   1638 	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
   1639 	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
   1640 
   1641 	.popsection
   1642 
   1643 #ifdef USE_AS_MEMMOVE
   1644 	ALIGN (4)
   1645 L(copy_backward):
   1646 	PUSH (%esi)
   1647 	movl	%eax, %esi
   1648 	lea	(%ecx,%edx,1),%edx
   1649 	lea	(%ecx,%esi,1),%esi
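/* Backward copy (descriptive summary): %edx and %esi now point one past
   the end of dst and src.  The code aligns the end of the destination to 4
   and then to 16 bytes, copies 64-byte blocks downwards in L(bk_ssse3_cpy)
   with movdqu loads and movdqa stores, and finishes the remainder through
   the 32-byte word loop and the L(table_48_bytes_bwd) jump table.  */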
   1650 	testl	$0x3, %edx
   1651 	jnz	L(bk_align)
   1652 
   1653 L(bk_aligned_4):
   1654 	cmp	$64, %ecx
   1655 	jae	L(bk_write_more64bytes)
   1656 
   1657 L(bk_write_64bytesless):
   1658 	cmp	$32, %ecx
   1659 	jb	L(bk_write_less32bytes)
   1660 
   1661 L(bk_write_more32bytes):
   1662 	/* Copy 32 bytes at a time.  */
   1663 	sub	$32, %ecx
   1664 	movl	-4(%esi), %eax
   1665 	movl	%eax, -4(%edx)
   1666 	movl	-8(%esi), %eax
   1667 	movl	%eax, -8(%edx)
   1668 	movl	-12(%esi), %eax
   1669 	movl	%eax, -12(%edx)
   1670 	movl	-16(%esi), %eax
   1671 	movl	%eax, -16(%edx)
   1672 	movl	-20(%esi), %eax
   1673 	movl	%eax, -20(%edx)
   1674 	movl	-24(%esi), %eax
   1675 	movl	%eax, -24(%edx)
   1676 	movl	-28(%esi), %eax
   1677 	movl	%eax, -28(%edx)
   1678 	movl	-32(%esi), %eax
   1679 	movl	%eax, -32(%edx)
   1680 	sub	$32, %edx
   1681 	sub	$32, %esi
   1682 
   1683 L(bk_write_less32bytes):
   1684 	movl	%esi, %eax
   1685 	sub	%ecx, %edx
   1686 	sub	%ecx, %eax
   1687 	POP (%esi)
   1688 L(bk_write_less32bytes_2):
   1689 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
   1690 
   1691 	ALIGN (4)
   1692 L(bk_align):
   1693 	cmp	$8, %ecx
   1694 	jbe	L(bk_write_less32bytes)
   1695 	testl	$1, %edx
    1696 	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
    1697 	   then (EDX & 2) must be != 0.  */
   1698 	jz	L(bk_got2)
   1699 	sub	$1, %esi
   1700 	sub	$1, %ecx
   1701 	sub	$1, %edx
   1702 	movzbl	(%esi), %eax
   1703 	movb	%al, (%edx)
   1704 
   1705 	testl	$2, %edx
   1706 	jz	L(bk_aligned_4)
   1707 
   1708 L(bk_got2):
   1709 	sub	$2, %esi
   1710 	sub	$2, %ecx
   1711 	sub	$2, %edx
   1712 	movzwl	(%esi), %eax
   1713 	movw	%ax, (%edx)
   1714 	jmp	L(bk_aligned_4)
   1715 
   1716 	ALIGN (4)
   1717 L(bk_write_more64bytes):
    1718 	/* Check the alignment of the end of the destination (EDX).  */
   1719 	testl	$15, %edx
   1720 	jz	L(bk_ssse3_cpy_pre)
   1721 
    1722 /* EDX is 4-byte aligned, but not 16-byte aligned.  */
   1723 L(bk_ssse3_align):
   1724 	sub	$4, %esi
   1725 	sub	$4, %ecx
   1726 	sub	$4, %edx
   1727 	movl	(%esi), %eax
   1728 	movl	%eax, (%edx)
   1729 
   1730 	testl	$15, %edx
   1731 	jz	L(bk_ssse3_cpy_pre)
   1732 
   1733 	sub	$4, %esi
   1734 	sub	$4, %ecx
   1735 	sub	$4, %edx
   1736 	movl	(%esi), %eax
   1737 	movl	%eax, (%edx)
   1738 
   1739 	testl	$15, %edx
   1740 	jz	L(bk_ssse3_cpy_pre)
   1741 
   1742 	sub	$4, %esi
   1743 	sub	$4, %ecx
   1744 	sub	$4, %edx
   1745 	movl	(%esi), %eax
   1746 	movl	%eax, (%edx)
   1747 
   1748 L(bk_ssse3_cpy_pre):
   1749 	cmp	$64, %ecx
   1750 	jb	L(bk_write_more32bytes)
   1751 
   1752 L(bk_ssse3_cpy):
   1753 	sub	$64, %esi
   1754 	sub	$64, %ecx
   1755 	sub	$64, %edx
   1756 	movdqu	0x30(%esi), %xmm3
   1757 	movdqa	%xmm3, 0x30(%edx)
   1758 	movdqu	0x20(%esi), %xmm2
   1759 	movdqa	%xmm2, 0x20(%edx)
   1760 	movdqu	0x10(%esi), %xmm1
   1761 	movdqa	%xmm1, 0x10(%edx)
   1762 	movdqu	(%esi), %xmm0
   1763 	movdqa	%xmm0, (%edx)
   1764 	cmp	$64, %ecx
   1765 	jae	L(bk_ssse3_cpy)
   1766 	jmp	L(bk_write_64bytesless)
   1767 
   1768 #endif
   1769 
   1770 END (MEMCPY)
   1771