Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #ifndef MEMCPY
        /* Symbol name given to the routine by ENTRY (MEMCPY) below; this
           default is used only when MEMCPY is not already defined.  */
     32 # define MEMCPY         ssse3_memcpy5
     33 #endif
     34 
     35 #ifndef L
        /* L(label) pastes a ".L" prefix onto LABEL, giving an assembler-local
           (non-exported) label name.  */
     36 # define L(label)	.L##label
     37 #endif
     38 
     39 #ifndef ALIGN
        /* ALIGN(n) expands to `.p2align n` (n is the power-of-two exponent).  */
     40 # define ALIGN(n)	.p2align n
     41 #endif
     42 
     43 #ifndef cfi_startproc
        /* The cfi_* helpers from here down to cfi_restore_state are fallbacks:
           each one expands to the GAS .cfi_* directive of the same name, and is
           only defined here when no earlier definition exists.  */
     44 # define cfi_startproc			.cfi_startproc
     45 #endif
     46 
     47 #ifndef cfi_endproc
     48 # define cfi_endproc			.cfi_endproc
     49 #endif
     50 
     51 #ifndef cfi_rel_offset
     52 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     53 #endif
     54 
     55 #ifndef cfi_restore
     56 # define cfi_restore(reg)		.cfi_restore reg
     57 #endif
     58 
     59 #ifndef cfi_adjust_cfa_offset
     60 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     61 #endif
     62 
     63 #ifndef cfi_remember_state
     64 # define cfi_remember_state		.cfi_remember_state
     65 #endif
     66 
     67 #ifndef cfi_restore_state
     68 # define cfi_restore_state		.cfi_restore_state
     69 #endif
     70 
     71 #ifndef ENTRY
        /* ENTRY(name): open a function definition.  Declares NAME as a global
           symbol of type @function, aligns it with `.p2align 4`, emits the
           label itself and starts its CFI region with cfi_startproc.
           Used only when ENTRY is not already defined.  */
     72 # define ENTRY(name)			\
     73 	.type name,  @function; 	\
     74 	.globl name;			\
     75 	.p2align 4;			\
     76 name:					\
     77 	cfi_startproc
     78 #endif
     79 
     80 #ifndef END
        /* END(name): close a function definition.  Ends the CFI region with
           cfi_endproc and sets the symbol size of NAME to the distance from
           the NAME label to the current location (`.-name`).  */
     81 # define END(name)			\
     82 	cfi_endproc;			\
     83 	.size name, .-name
     84 #endif
     85 
     86 #ifdef USE_AS_BCOPY
        /* SRC, DEST and LEN are the %esp-relative offsets of the three
           arguments (they are used below as SRC(%esp), DEST(%esp), LEN(%esp)).
           PARMS is the offset of the first argument and each following one is
           4 bytes further.  With USE_AS_BCOPY the source pointer comes first;
           otherwise the destination pointer does.  LEN is last in both cases.  */
     87 # define SRC		PARMS
     88 # define DEST		SRC+4
     89 # define LEN		DEST+4
     90 #else
     91 # define DEST		PARMS
     92 # define SRC		DEST+4
     93 # define LEN		SRC+4
     94 #endif
     95 
        /* CFI_PUSH / CFI_POP emit only unwind directives (no instructions):
           CFI_PUSH grows the CFA offset by 4 and records REG at relative
           offset 0; CFI_POP shrinks the CFA offset by 4 and marks REG as
           restored.  PUSH / POP combine the real pushl / popl with the
           matching annotation.  */
     96 #define CFI_PUSH(REG)						\
     97   cfi_adjust_cfa_offset (4);					\
     98   cfi_rel_offset (REG, 0)
     99 
    100 #define CFI_POP(REG)						\
    101   cfi_adjust_cfa_offset (-4);					\
    102   cfi_restore (REG)
    103 
    104 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
    105 #define POP(REG)	popl REG; CFI_POP (REG)
    106 
    107 #ifdef SHARED
        /* SHARED build: EBX is used as a scratch/PC register by the jump-table
           macros below, so ENTRANCE pushes it and RETURN_END pops it before
           `ret`.  The push is why PARMS is 8 here rather than 4.
           RETURN is RETURN_END followed by CFI_PUSH (%ebx), which emits no
           instruction; presumably it re-declares the saved EBX for the
           unwinder because more code of the same function follows the `ret`.
           JMPTBL(I, B) is the value stored in a jump table for target I; here
           it is the offset of I from B.  */
    108 # define PARMS		8		/* Preserve EBX.  */
    109 # define ENTRANCE	PUSH (%ebx);
    110 # define RETURN_END	POP (%ebx); ret
    111 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    112 # define JMPTBL(I, B)	I - B
    113 
    114 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    115    jump table with relative offsets.  INDEX is a register that contains
    116    the index into the jump table.  SCALE is the scale of INDEX.  */
    117 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    118     /* We first load PC into EBX.  */				\
    119     call	__i686.get_pc_thunk.bx;				\
    120     /* Get the address of the jump table.  */			\
    121     addl	$(TABLE - .), %ebx;				\
    122     /* Get the entry and convert the relative offset to the	\
    123        absolute address.  */					\
    124     addl	(%ebx,INDEX,SCALE), %ebx;			\
    125     /* We loaded the jump table.  Go.  */			\
    126     jmp		*%ebx
    127 
        /* Split form of the macro above, without the PC thunk call.
           BRANCH_TO_JMPTBL_ENTRY_VALUE adds the distance from this instruction
           to TABLE onto EBX, so EBX ends up holding the address of TABLE only
           if it held the address of this instruction beforehand.  That holds
           when the macro is the first instruction of a target reached through
           BRANCH_TO_JMPTBL_ENTRY, which jumps via EBX.
           BRANCH_TO_JMPTBL_ENTRY_TAIL then expects EBX to hold the address of
           TABLE: it adds the selected relative entry and jumps.  */
    128 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
    129     addl	$(TABLE - .), %ebx
    130 
    131 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
    132     addl	(%ebx,INDEX,SCALE), %ebx;			\
    133     /* We loaded the jump table.  Go.  */			\
    134     jmp		*%ebx
    135 
    136 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
        /* PC thunk: returns in EBX the return address of the `call` that
           invoked it, i.e. the address of the instruction after that call.
           The symbol is global but hidden and lives in its own
           .gnu.linkonce section.  */
    137 	.globl	__i686.get_pc_thunk.bx
    138 	.hidden	__i686.get_pc_thunk.bx
    139 	ALIGN (4)
    140 	.type	__i686.get_pc_thunk.bx,@function
    141 __i686.get_pc_thunk.bx:
    142 	movl	(%esp), %ebx	/* EBX = return address on top of the stack.  */
    143 	ret
    144 #else
        /* Non-SHARED build: nothing is pushed on entry (ENTRANCE is empty), so
           the first argument is at 4(%esp) and RETURN is a plain `ret`.
           JMPTBL(I, B) is simply I, i.e. the table holds absolute addresses.  */
    145 # define PARMS		4
    146 # define ENTRANCE
    147 # define RETURN_END	ret
    148 # define RETURN		RETURN_END
    149 # define JMPTBL(I, B)	I
    150 
    151 /* Branch to an entry in a jump table.  TABLE is a jump table with
    152    absolute offsets.  INDEX is a register that contains the index into
    153    the jump table.  SCALE is the scale of INDEX.  */
    154 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    155     jmp		*TABLE(,INDEX,SCALE)
    156 
        /* With absolute tables no base register is needed, so the _VALUE step
           expands to nothing and _TAIL is the same indirect jump as above.  */
    157 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
    158 
    159 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
    160     jmp		*TABLE(,INDEX,SCALE)
    161 #endif
    162 
    163 	.section .text.ssse3,"ax",@progbits
    164 ENTRY (MEMCPY)
    165 	ENTRANCE
    166 	movl	LEN(%esp), %ecx
    167 	movl	SRC(%esp), %eax
    168 	movl	DEST(%esp), %edx
    169 
    170 #ifdef USE_AS_MEMMOVE
    171 	cmp	%eax, %edx
    172 	jb	L(copy_forward)
    173 	je	L(fwd_write_0bytes)
    174 	cmp	$32, %ecx
    175 	jae	L(memmove_bwd)
    176 	jmp	L(bk_write_less32bytes_2)
    177 L(memmove_bwd):
    178 	add	%ecx, %eax
    179 	cmp	%eax, %edx
    180 	movl	SRC(%esp), %eax
    181 	jb	L(copy_backward)
    182 
    183 L(copy_forward):
    184 #endif
    185 	cmp	$48, %ecx
    186 	jae	L(48bytesormore)
    187 
    188 L(fwd_write_less32bytes):
    189 #ifndef USE_AS_MEMMOVE
    190 	cmp	%dl, %al
    191 	jb	L(bk_write)
    192 #endif
    193 	add	%ecx, %edx
    194 	add	%ecx, %eax
    195 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    196 #ifndef USE_AS_MEMMOVE
    197 L(bk_write):
    198 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
    199 #endif
    200 
    201 	ALIGN (4)
    202 /* ECX >= 48.  EDX (destination) is not aligned yet; it is advanced to the next 16-byte boundary below.  */
    203 L(48bytesormore):
    204 	movdqu	(%eax), %xmm0
    205 	PUSH (%edi)
    206 	movl	%edx, %edi
    207 	and	$-16, %edx
    208 	PUSH (%esi)
    209 	cfi_remember_state
    210 	add	$16, %edx
    211 	movl	%edi, %esi
    212 	sub	%edx, %edi
    213 	add	%edi, %ecx
    214 	sub	%edi, %eax
    215 
    216 #ifdef SHARED_CACHE_SIZE_HALF
    217 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
    218 #else
    219 # ifdef SHARED
    220 	call	__i686.get_pc_thunk.bx
    221 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    222 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
    223 # else
    224 	cmp	__x86_shared_cache_size_half, %ecx
    225 # endif
    226 #endif
    227 
    228 	mov	%eax, %edi
    229 	jae	L(large_page)
    230 	and	$0xf, %edi
    231 	jz	L(shl_0)
    232 
    233 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
    234 
    235 	cfi_restore_state
    236 	cfi_remember_state
    237 	ALIGN (4)
    238 L(shl_0):
    239 	movdqu	%xmm0, (%esi)
    240 	xor	%edi, %edi
    241 	POP (%esi)
    242 	cmp	$127, %ecx
    243 	ja	L(shl_0_gobble)
    244 	lea	-32(%ecx), %ecx
    245 L(shl_0_loop):
    246 	movdqa	(%eax, %edi), %xmm0
    247 	movdqa	16(%eax, %edi), %xmm1
    248 	sub	$32, %ecx
    249 	movdqa	%xmm0, (%edx, %edi)
    250 	movdqa	%xmm1, 16(%edx, %edi)
    251 	lea	32(%edi), %edi
    252 	jb	L(shl_0_end)
    253 
    254 	movdqa	(%eax, %edi), %xmm0
    255 	movdqa	16(%eax, %edi), %xmm1
    256 	sub	$32, %ecx
    257 	movdqa	%xmm0, (%edx, %edi)
    258 	movdqa	%xmm1, 16(%edx, %edi)
    259 	lea	32(%edi), %edi
    260 	jb	L(shl_0_end)
    261 
    262 	movdqa	(%eax, %edi), %xmm0
    263 	movdqa	16(%eax, %edi), %xmm1
    264 	sub	$32, %ecx
    265 	movdqa	%xmm0, (%edx, %edi)
    266 	movdqa	%xmm1, 16(%edx, %edi)
    267 	lea	32(%edi), %edi
    268 	jb	L(shl_0_end)
    269 
    270 	movdqa	(%eax, %edi), %xmm0
    271 	movdqa	16(%eax, %edi), %xmm1
    272 	sub	$32, %ecx
    273 	movdqa	%xmm0, (%edx, %edi)
    274 	movdqa	%xmm1, 16(%edx, %edi)
    275 	lea	32(%edi), %edi
    276 L(shl_0_end):
    277 	lea	32(%ecx), %ecx
    278 	add	%ecx, %edi
    279 	add	%edi, %edx
    280 	add	%edi, %eax
    281 	POP (%edi)
    282 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    283 
    284 	CFI_PUSH (%edi)
    285 L(shl_0_gobble):
    286 
    287 #ifdef DATA_CACHE_SIZE_HALF
    288 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
    289 #else
    290 # ifdef SHARED
    291 	call	__i686.get_pc_thunk.bx
    292 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    293 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
    294 # else
    295 	cmp	__x86_data_cache_size_half, %ecx
    296 # endif
    297 #endif
    298 
    299 	POP (%edi)
    300 	lea	-128(%ecx), %ecx
    301 	jae	L(shl_0_gobble_mem_loop)
    302 L(shl_0_gobble_cache_loop):
    303 	movdqa	(%eax), %xmm0
    304 	movdqa	0x10(%eax), %xmm1
    305 	movdqa	0x20(%eax), %xmm2
    306 	movdqa	0x30(%eax), %xmm3
    307 	movdqa	0x40(%eax), %xmm4
    308 	movdqa	0x50(%eax), %xmm5
    309 	movdqa	0x60(%eax), %xmm6
    310 	movdqa	0x70(%eax), %xmm7
    311 	lea	0x80(%eax), %eax
    312 	sub	$128, %ecx
    313 	movdqa	%xmm0, (%edx)
    314 	movdqa	%xmm1, 0x10(%edx)
    315 	movdqa	%xmm2, 0x20(%edx)
    316 	movdqa	%xmm3, 0x30(%edx)
    317 	movdqa	%xmm4, 0x40(%edx)
    318 	movdqa	%xmm5, 0x50(%edx)
    319 	movdqa	%xmm6, 0x60(%edx)
    320 	movdqa	%xmm7, 0x70(%edx)
    321 	lea	0x80(%edx), %edx
    322 
    323 	jae	L(shl_0_gobble_cache_loop)
    324 	cmp	$-0x40, %ecx
    325 	lea	0x80(%ecx), %ecx
    326 	jl	L(shl_0_cache_less_64bytes)
    327 
    328 	movdqa	(%eax), %xmm0
    329 	sub	$0x40, %ecx
    330 	movdqa	0x10(%eax), %xmm1
    331 
    332 	movdqa	%xmm0, (%edx)
    333 	movdqa	%xmm1, 0x10(%edx)
    334 
    335 	movdqa	0x20(%eax), %xmm0
    336 	movdqa	0x30(%eax), %xmm1
    337 	add	$0x40, %eax
    338 
    339 	movdqa	%xmm0, 0x20(%edx)
    340 	movdqa	%xmm1, 0x30(%edx)
    341 	add	$0x40, %edx
    342 L(shl_0_cache_less_64bytes):
    343 	cmp	$0x20, %ecx
    344 	jb	L(shl_0_cache_less_32bytes)
    345 	movdqa	(%eax), %xmm0
    346 	sub	$0x20, %ecx
    347 	movdqa	0x10(%eax), %xmm1
    348 	add	$0x20, %eax
    349 	movdqa	%xmm0, (%edx)
    350 	movdqa	%xmm1, 0x10(%edx)
    351 	add	$0x20, %edx
    352 L(shl_0_cache_less_32bytes):
    353 	cmp	$0x10, %ecx
    354 	jb	L(shl_0_cache_less_16bytes)
    355 	sub	$0x10, %ecx
    356 	movdqa	(%eax), %xmm0
    357 	add	$0x10, %eax
    358 	movdqa	%xmm0, (%edx)
    359 	add	$0x10, %edx
    360 L(shl_0_cache_less_16bytes):
    361 	add	%ecx, %edx
    362 	add	%ecx, %eax
    363 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    364 
    365 
    366 	ALIGN (4)
    367 L(shl_0_gobble_mem_loop):
    368 	prefetcht0 0x1c0(%eax)
    369 	prefetcht0 0x280(%eax)
    370 	prefetcht0 0x1c0(%edx)
    371 
    372 	movdqa	(%eax), %xmm0
    373 	movdqa	0x10(%eax), %xmm1
    374 	movdqa	0x20(%eax), %xmm2
    375 	movdqa	0x30(%eax), %xmm3
    376 	movdqa	0x40(%eax), %xmm4
    377 	movdqa	0x50(%eax), %xmm5
    378 	movdqa	0x60(%eax), %xmm6
    379 	movdqa	0x70(%eax), %xmm7
    380 	lea	0x80(%eax), %eax
    381 	sub	$0x80, %ecx
    382 	movdqa	%xmm0, (%edx)
    383 	movdqa	%xmm1, 0x10(%edx)
    384 	movdqa	%xmm2, 0x20(%edx)
    385 	movdqa	%xmm3, 0x30(%edx)
    386 	movdqa	%xmm4, 0x40(%edx)
    387 	movdqa	%xmm5, 0x50(%edx)
    388 	movdqa	%xmm6, 0x60(%edx)
    389 	movdqa	%xmm7, 0x70(%edx)
    390 	lea	0x80(%edx), %edx
    391 
    392 	jae	L(shl_0_gobble_mem_loop)
    393 	cmp	$-0x40, %ecx
    394 	lea	0x80(%ecx), %ecx
    395 	jl	L(shl_0_mem_less_64bytes)
    396 
    397 	movdqa	(%eax), %xmm0
    398 	sub	$0x40, %ecx
    399 	movdqa	0x10(%eax), %xmm1
    400 
    401 	movdqa	%xmm0, (%edx)
    402 	movdqa	%xmm1, 0x10(%edx)
    403 
    404 	movdqa	0x20(%eax), %xmm0
    405 	movdqa	0x30(%eax), %xmm1
    406 	add	$0x40, %eax
    407 
    408 	movdqa	%xmm0, 0x20(%edx)
    409 	movdqa	%xmm1, 0x30(%edx)
    410 	add	$0x40, %edx
    411 L(shl_0_mem_less_64bytes):
    412 	cmp	$0x20, %ecx
    413 	jb	L(shl_0_mem_less_32bytes)
    414 	movdqa	(%eax), %xmm0
    415 	sub	$0x20, %ecx
    416 	movdqa	0x10(%eax), %xmm1
    417 	add	$0x20, %eax
    418 	movdqa	%xmm0, (%edx)
    419 	movdqa	%xmm1, 0x10(%edx)
    420 	add	$0x20, %edx
    421 L(shl_0_mem_less_32bytes):
    422 	cmp	$0x10, %ecx
    423 	jb	L(shl_0_mem_less_16bytes)
    424 	sub	$0x10, %ecx
    425 	movdqa	(%eax), %xmm0
    426 	add	$0x10, %eax
    427 	movdqa	%xmm0, (%edx)
    428 	add	$0x10, %edx
    429 L(shl_0_mem_less_16bytes):
    430 	add	%ecx, %edx
    431 	add	%ecx, %eax
    432 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
    433 
    434 	cfi_restore_state
    435 	cfi_remember_state
    436 	ALIGN (4)
    437 L(shl_1):
    438 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    439 	lea	-1(%eax), %eax
    440 	movaps	(%eax), %xmm1
    441 	xor	%edi, %edi
    442 	lea	-32(%ecx), %ecx
    443 	movdqu	%xmm0, (%esi)
    444 	POP (%esi)
    445 L(shl_1_loop):
    446 
    447 	movdqa	16(%eax, %edi), %xmm2
    448 	sub	$32, %ecx
    449 	movdqa	32(%eax, %edi), %xmm3
    450 	movdqa	%xmm3, %xmm4
    451 	palignr	$1, %xmm2, %xmm3
    452 	palignr	$1, %xmm1, %xmm2
    453 	lea	32(%edi), %edi
    454 	movdqa	%xmm2, -32(%edx, %edi)
    455 	movdqa	%xmm3, -16(%edx, %edi)
    456 
    457 	jb	L(shl_1_end)
    458 
    459 	movdqa	16(%eax, %edi), %xmm2
    460 	sub	$32, %ecx
    461 	movdqa	32(%eax, %edi), %xmm3
    462 	movdqa	%xmm3, %xmm1
    463 	palignr	$1, %xmm2, %xmm3
    464 	palignr	$1, %xmm4, %xmm2
    465 	lea	32(%edi), %edi
    466 	movdqa	%xmm2, -32(%edx, %edi)
    467 	movdqa	%xmm3, -16(%edx, %edi)
    468 
    469 	jae	L(shl_1_loop)
    470 
    471 L(shl_1_end):
    472 	lea	32(%ecx), %ecx
    473 	add	%ecx, %edi
    474 	add	%edi, %edx
    475 	lea	1(%edi, %eax), %eax
    476 	POP (%edi)
    477 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    478 
    479 	cfi_restore_state
    480 	cfi_remember_state
    481 	ALIGN (4)
    482 L(shl_2):
    483 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    484 	lea	-2(%eax), %eax
    485 	movaps	(%eax), %xmm1
    486 	xor	%edi, %edi
    487 	lea	-32(%ecx), %ecx
    488 	movdqu	%xmm0, (%esi)
    489 	POP (%esi)
    490 L(shl_2_loop):
    491 
    492 	movdqa	16(%eax, %edi), %xmm2
    493 	sub	$32, %ecx
    494 	movdqa	32(%eax, %edi), %xmm3
    495 	movdqa	%xmm3, %xmm4
    496 	palignr	$2, %xmm2, %xmm3
    497 	palignr	$2, %xmm1, %xmm2
    498 	lea	32(%edi), %edi
    499 	movdqa	%xmm2, -32(%edx, %edi)
    500 	movdqa	%xmm3, -16(%edx, %edi)
    501 
    502 	jb	L(shl_2_end)
    503 
    504 	movdqa	16(%eax, %edi), %xmm2
    505 	sub	$32, %ecx
    506 	movdqa	32(%eax, %edi), %xmm3
    507 	movdqa	%xmm3, %xmm1
    508 	palignr	$2, %xmm2, %xmm3
    509 	palignr	$2, %xmm4, %xmm2
    510 	lea	32(%edi), %edi
    511 	movdqa	%xmm2, -32(%edx, %edi)
    512 	movdqa	%xmm3, -16(%edx, %edi)
    513 
    514 	jae	L(shl_2_loop)
    515 
    516 L(shl_2_end):
    517 	lea	32(%ecx), %ecx
    518 	add	%ecx, %edi
    519 	add	%edi, %edx
    520 	lea	2(%edi, %eax), %eax
    521 	POP (%edi)
    522 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    523 
    524 	cfi_restore_state
    525 	cfi_remember_state
    526 	ALIGN (4)
    527 L(shl_3):
    528 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    529 	lea	-3(%eax), %eax
    530 	movaps	(%eax), %xmm1
    531 	xor	%edi, %edi
    532 	lea	-32(%ecx), %ecx
    533 	movdqu	%xmm0, (%esi)
    534 	POP (%esi)
    535 L(shl_3_loop):
    536 
    537 	movdqa	16(%eax, %edi), %xmm2
    538 	sub	$32, %ecx
    539 	movdqa	32(%eax, %edi), %xmm3
    540 	movdqa	%xmm3, %xmm4
    541 	palignr	$3, %xmm2, %xmm3
    542 	palignr	$3, %xmm1, %xmm2
    543 	lea	32(%edi), %edi
    544 	movdqa	%xmm2, -32(%edx, %edi)
    545 	movdqa	%xmm3, -16(%edx, %edi)
    546 
    547 	jb	L(shl_3_end)
    548 
    549 	movdqa	16(%eax, %edi), %xmm2
    550 	sub	$32, %ecx
    551 	movdqa	32(%eax, %edi), %xmm3
    552 	movdqa	%xmm3, %xmm1
    553 	palignr	$3, %xmm2, %xmm3
    554 	palignr	$3, %xmm4, %xmm2
    555 	lea	32(%edi), %edi
    556 	movdqa	%xmm2, -32(%edx, %edi)
    557 	movdqa	%xmm3, -16(%edx, %edi)
    558 
    559 	jae	L(shl_3_loop)
    560 
    561 L(shl_3_end):
    562 	lea	32(%ecx), %ecx
    563 	add	%ecx, %edi
    564 	add	%edi, %edx
    565 	lea	3(%edi, %eax), %eax
    566 	POP (%edi)
    567 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    568 
    569 	cfi_restore_state
    570 	cfi_remember_state
    571 	ALIGN (4)
    572 L(shl_4):
    573 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    574 	lea	-4(%eax), %eax
    575 	movaps	(%eax), %xmm1
    576 	xor	%edi, %edi
    577 	lea	-32(%ecx), %ecx
    578 	movdqu	%xmm0, (%esi)
    579 	POP (%esi)
    580 L(shl_4_loop):
    581 
    582 	movdqa	16(%eax, %edi), %xmm2
    583 	sub	$32, %ecx
    584 	movdqa	32(%eax, %edi), %xmm3
    585 	movdqa	%xmm3, %xmm4
    586 	palignr	$4, %xmm2, %xmm3
    587 	palignr	$4, %xmm1, %xmm2
    588 	lea	32(%edi), %edi
    589 	movdqa	%xmm2, -32(%edx, %edi)
    590 	movdqa	%xmm3, -16(%edx, %edi)
    591 
    592 	jb	L(shl_4_end)
    593 
    594 	movdqa	16(%eax, %edi), %xmm2
    595 	sub	$32, %ecx
    596 	movdqa	32(%eax, %edi), %xmm3
    597 	movdqa	%xmm3, %xmm1
    598 	palignr	$4, %xmm2, %xmm3
    599 	palignr	$4, %xmm4, %xmm2
    600 	lea	32(%edi), %edi
    601 	movdqa	%xmm2, -32(%edx, %edi)
    602 	movdqa	%xmm3, -16(%edx, %edi)
    603 
    604 	jae	L(shl_4_loop)
    605 
    606 L(shl_4_end):
    607 	lea	32(%ecx), %ecx
    608 	add	%ecx, %edi
    609 	add	%edi, %edx
    610 	lea	4(%edi, %eax), %eax
    611 	POP (%edi)
    612 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    613 
    614 	cfi_restore_state
    615 	cfi_remember_state
    616 	ALIGN (4)
    617 L(shl_5):
    618 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    619 	lea	-5(%eax), %eax
    620 	movaps	(%eax), %xmm1
    621 	xor	%edi, %edi
    622 	lea	-32(%ecx), %ecx
    623 	movdqu	%xmm0, (%esi)
    624 	POP (%esi)
    625 L(shl_5_loop):
    626 
    627 	movdqa	16(%eax, %edi), %xmm2
    628 	sub	$32, %ecx
    629 	movdqa	32(%eax, %edi), %xmm3
    630 	movdqa	%xmm3, %xmm4
    631 	palignr	$5, %xmm2, %xmm3
    632 	palignr	$5, %xmm1, %xmm2
    633 	lea	32(%edi), %edi
    634 	movdqa	%xmm2, -32(%edx, %edi)
    635 	movdqa	%xmm3, -16(%edx, %edi)
    636 
    637 	jb	L(shl_5_end)
    638 
    639 	movdqa	16(%eax, %edi), %xmm2
    640 	sub	$32, %ecx
    641 	movdqa	32(%eax, %edi), %xmm3
    642 	movdqa	%xmm3, %xmm1
    643 	palignr	$5, %xmm2, %xmm3
    644 	palignr	$5, %xmm4, %xmm2
    645 	lea	32(%edi), %edi
    646 	movdqa	%xmm2, -32(%edx, %edi)
    647 	movdqa	%xmm3, -16(%edx, %edi)
    648 
    649 	jae	L(shl_5_loop)
    650 
    651 L(shl_5_end):
    652 	lea	32(%ecx), %ecx
    653 	add	%ecx, %edi
    654 	add	%edi, %edx
    655 	lea	5(%edi, %eax), %eax
    656 	POP (%edi)
    657 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    658 
    659 	cfi_restore_state
    660 	cfi_remember_state
    661 	ALIGN (4)
    662 L(shl_6):
    663 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    664 	lea	-6(%eax), %eax
    665 	movaps	(%eax), %xmm1
    666 	xor	%edi, %edi
    667 	lea	-32(%ecx), %ecx
    668 	movdqu	%xmm0, (%esi)
    669 	POP (%esi)
    670 L(shl_6_loop):
    671 
    672 	movdqa	16(%eax, %edi), %xmm2
    673 	sub	$32, %ecx
    674 	movdqa	32(%eax, %edi), %xmm3
    675 	movdqa	%xmm3, %xmm4
    676 	palignr	$6, %xmm2, %xmm3
    677 	palignr	$6, %xmm1, %xmm2
    678 	lea	32(%edi), %edi
    679 	movdqa	%xmm2, -32(%edx, %edi)
    680 	movdqa	%xmm3, -16(%edx, %edi)
    681 
    682 	jb	L(shl_6_end)
    683 
    684 	movdqa	16(%eax, %edi), %xmm2
    685 	sub	$32, %ecx
    686 	movdqa	32(%eax, %edi), %xmm3
    687 	movdqa	%xmm3, %xmm1
    688 	palignr	$6, %xmm2, %xmm3
    689 	palignr	$6, %xmm4, %xmm2
    690 	lea	32(%edi), %edi
    691 	movdqa	%xmm2, -32(%edx, %edi)
    692 	movdqa	%xmm3, -16(%edx, %edi)
    693 
    694 	jae	L(shl_6_loop)
    695 
    696 L(shl_6_end):
    697 	lea	32(%ecx), %ecx
    698 	add	%ecx, %edi
    699 	add	%edi, %edx
    700 	lea	6(%edi, %eax), %eax
    701 	POP (%edi)
    702 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    703 
    704 	cfi_restore_state
    705 	cfi_remember_state
    706 	ALIGN (4)
    707 L(shl_7):
    708 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    709 	lea	-7(%eax), %eax
    710 	movaps	(%eax), %xmm1
    711 	xor	%edi, %edi
    712 	lea	-32(%ecx), %ecx
    713 	movdqu	%xmm0, (%esi)
    714 	POP (%esi)
    715 L(shl_7_loop):
    716 
    717 	movdqa	16(%eax, %edi), %xmm2
    718 	sub	$32, %ecx
    719 	movdqa	32(%eax, %edi), %xmm3
    720 	movdqa	%xmm3, %xmm4
    721 	palignr	$7, %xmm2, %xmm3
    722 	palignr	$7, %xmm1, %xmm2
    723 	lea	32(%edi), %edi
    724 	movdqa	%xmm2, -32(%edx, %edi)
    725 	movdqa	%xmm3, -16(%edx, %edi)
    726 
    727 	jb	L(shl_7_end)
    728 
    729 	movdqa	16(%eax, %edi), %xmm2
    730 	sub	$32, %ecx
    731 	movdqa	32(%eax, %edi), %xmm3
    732 	movdqa	%xmm3, %xmm1
    733 	palignr	$7, %xmm2, %xmm3
    734 	palignr	$7, %xmm4, %xmm2
    735 	lea	32(%edi), %edi
    736 	movdqa	%xmm2, -32(%edx, %edi)
    737 	movdqa	%xmm3, -16(%edx, %edi)
    738 
    739 	jae	L(shl_7_loop)
    740 
    741 L(shl_7_end):
    742 	lea	32(%ecx), %ecx
    743 	add	%ecx, %edi
    744 	add	%edi, %edx
    745 	lea	7(%edi, %eax), %eax
    746 	POP (%edi)
    747 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    748 
    749 	cfi_restore_state
    750 	cfi_remember_state
    751 	ALIGN (4)
    752 L(shl_8):
    753 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    754 	lea	-8(%eax), %eax
    755 	movaps	(%eax), %xmm1
    756 	xor	%edi, %edi
    757 	lea	-32(%ecx), %ecx
    758 	movdqu	%xmm0, (%esi)
    759 	POP (%esi)
    760 L(shl_8_loop):
    761 
    762 	movdqa	16(%eax, %edi), %xmm2
    763 	sub	$32, %ecx
    764 	movdqa	32(%eax, %edi), %xmm3
    765 	movdqa	%xmm3, %xmm4
    766 	palignr	$8, %xmm2, %xmm3
    767 	palignr	$8, %xmm1, %xmm2
    768 	lea	32(%edi), %edi
    769 	movdqa	%xmm2, -32(%edx, %edi)
    770 	movdqa	%xmm3, -16(%edx, %edi)
    771 
    772 	jb	L(shl_8_end)
    773 
    774 	movdqa	16(%eax, %edi), %xmm2
    775 	sub	$32, %ecx
    776 	movdqa	32(%eax, %edi), %xmm3
    777 	movdqa	%xmm3, %xmm1
    778 	palignr	$8, %xmm2, %xmm3
    779 	palignr	$8, %xmm4, %xmm2
    780 	lea	32(%edi), %edi
    781 	movdqa	%xmm2, -32(%edx, %edi)
    782 	movdqa	%xmm3, -16(%edx, %edi)
    783 
    784 	jae	L(shl_8_loop)
    785 
    786 L(shl_8_end):
    787 	lea	32(%ecx), %ecx
    788 	add	%ecx, %edi
    789 	add	%edi, %edx
    790 	lea	8(%edi, %eax), %eax
    791 	POP (%edi)
    792 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    793 
    794 	cfi_restore_state
    795 	cfi_remember_state
    796 	ALIGN (4)
    797 L(shl_9):
    798 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    799 	lea	-9(%eax), %eax
    800 	movaps	(%eax), %xmm1
    801 	xor	%edi, %edi
    802 	lea	-32(%ecx), %ecx
    803 	movdqu	%xmm0, (%esi)
    804 	POP (%esi)
    805 L(shl_9_loop):
    806 
    807 	movdqa	16(%eax, %edi), %xmm2
    808 	sub	$32, %ecx
    809 	movdqa	32(%eax, %edi), %xmm3
    810 	movdqa	%xmm3, %xmm4
    811 	palignr	$9, %xmm2, %xmm3
    812 	palignr	$9, %xmm1, %xmm2
    813 	lea	32(%edi), %edi
    814 	movdqa	%xmm2, -32(%edx, %edi)
    815 	movdqa	%xmm3, -16(%edx, %edi)
    816 
    817 	jb	L(shl_9_end)
    818 
    819 	movdqa	16(%eax, %edi), %xmm2
    820 	sub	$32, %ecx
    821 	movdqa	32(%eax, %edi), %xmm3
    822 	movdqa	%xmm3, %xmm1
    823 	palignr	$9, %xmm2, %xmm3
    824 	palignr	$9, %xmm4, %xmm2
    825 	lea	32(%edi), %edi
    826 	movdqa	%xmm2, -32(%edx, %edi)
    827 	movdqa	%xmm3, -16(%edx, %edi)
    828 
    829 	jae	L(shl_9_loop)
    830 
    831 L(shl_9_end):
    832 	lea	32(%ecx), %ecx
    833 	add	%ecx, %edi
    834 	add	%edi, %edx
    835 	lea	9(%edi, %eax), %eax
    836 	POP (%edi)
    837 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    838 
    839 	cfi_restore_state
    840 	cfi_remember_state
    841 	ALIGN (4)
    842 L(shl_10):
    843 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    844 	lea	-10(%eax), %eax
    845 	movaps	(%eax), %xmm1
    846 	xor	%edi, %edi
    847 	lea	-32(%ecx), %ecx
    848 	movdqu	%xmm0, (%esi)
    849 	POP (%esi)
    850 L(shl_10_loop):
    851 
    852 	movdqa	16(%eax, %edi), %xmm2
    853 	sub	$32, %ecx
    854 	movdqa	32(%eax, %edi), %xmm3
    855 	movdqa	%xmm3, %xmm4
    856 	palignr	$10, %xmm2, %xmm3
    857 	palignr	$10, %xmm1, %xmm2
    858 	lea	32(%edi), %edi
    859 	movdqa	%xmm2, -32(%edx, %edi)
    860 	movdqa	%xmm3, -16(%edx, %edi)
    861 
    862 	jb	L(shl_10_end)
    863 
    864 	movdqa	16(%eax, %edi), %xmm2
    865 	sub	$32, %ecx
    866 	movdqa	32(%eax, %edi), %xmm3
    867 	movdqa	%xmm3, %xmm1
    868 	palignr	$10, %xmm2, %xmm3
    869 	palignr	$10, %xmm4, %xmm2
    870 	lea	32(%edi), %edi
    871 	movdqa	%xmm2, -32(%edx, %edi)
    872 	movdqa	%xmm3, -16(%edx, %edi)
    873 
    874 	jae	L(shl_10_loop)
    875 
    876 L(shl_10_end):
    877 	lea	32(%ecx), %ecx
    878 	add	%ecx, %edi
    879 	add	%edi, %edx
    880 	lea	10(%edi, %eax), %eax
    881 	POP (%edi)
    882 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    883 
    884 	cfi_restore_state
    885 	cfi_remember_state
    886 	ALIGN (4)
    887 L(shl_11):
    888 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    889 	lea	-11(%eax), %eax
    890 	movaps	(%eax), %xmm1
    891 	xor	%edi, %edi
    892 	lea	-32(%ecx), %ecx
    893 	movdqu	%xmm0, (%esi)
    894 	POP (%esi)
    895 L(shl_11_loop):
    896 
    897 	movdqa	16(%eax, %edi), %xmm2
    898 	sub	$32, %ecx
    899 	movdqa	32(%eax, %edi), %xmm3
    900 	movdqa	%xmm3, %xmm4
    901 	palignr	$11, %xmm2, %xmm3
    902 	palignr	$11, %xmm1, %xmm2
    903 	lea	32(%edi), %edi
    904 	movdqa	%xmm2, -32(%edx, %edi)
    905 	movdqa	%xmm3, -16(%edx, %edi)
    906 
    907 	jb	L(shl_11_end)
    908 
    909 	movdqa	16(%eax, %edi), %xmm2
    910 	sub	$32, %ecx
    911 	movdqa	32(%eax, %edi), %xmm3
    912 	movdqa	%xmm3, %xmm1
    913 	palignr	$11, %xmm2, %xmm3
    914 	palignr	$11, %xmm4, %xmm2
    915 	lea	32(%edi), %edi
    916 	movdqa	%xmm2, -32(%edx, %edi)
    917 	movdqa	%xmm3, -16(%edx, %edi)
    918 
    919 	jae	L(shl_11_loop)
    920 
    921 L(shl_11_end):
    922 	lea	32(%ecx), %ecx
    923 	add	%ecx, %edi
    924 	add	%edi, %edx
    925 	lea	11(%edi, %eax), %eax
    926 	POP (%edi)
    927 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    928 
    929 	cfi_restore_state
    930 	cfi_remember_state
    931 	ALIGN (4)
    932 L(shl_12):
    933 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    934 	lea	-12(%eax), %eax
    935 	movaps	(%eax), %xmm1
    936 	xor	%edi, %edi
    937 	lea	-32(%ecx), %ecx
    938 	movdqu	%xmm0, (%esi)
    939 	POP (%esi)
    940 L(shl_12_loop):
    941 
    942 	movdqa	16(%eax, %edi), %xmm2
    943 	sub	$32, %ecx
    944 	movdqa	32(%eax, %edi), %xmm3
    945 	movdqa	%xmm3, %xmm4
    946 	palignr	$12, %xmm2, %xmm3
    947 	palignr	$12, %xmm1, %xmm2
    948 	lea	32(%edi), %edi
    949 	movdqa	%xmm2, -32(%edx, %edi)
    950 	movdqa	%xmm3, -16(%edx, %edi)
    951 
    952 	jb	L(shl_12_end)
    953 
    954 	movdqa	16(%eax, %edi), %xmm2
    955 	sub	$32, %ecx
    956 	movdqa	32(%eax, %edi), %xmm3
    957 	movdqa	%xmm3, %xmm1
    958 	palignr	$12, %xmm2, %xmm3
    959 	palignr	$12, %xmm4, %xmm2
    960 	lea	32(%edi), %edi
    961 	movdqa	%xmm2, -32(%edx, %edi)
    962 	movdqa	%xmm3, -16(%edx, %edi)
    963 
    964 	jae	L(shl_12_loop)
    965 
    966 L(shl_12_end):
    967 	lea	32(%ecx), %ecx
    968 	add	%ecx, %edi
    969 	add	%edi, %edx
    970 	lea	12(%edi, %eax), %eax
    971 	POP (%edi)
    972 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
    973 
    974 	cfi_restore_state
    975 	cfi_remember_state
    976 	ALIGN (4)
    977 L(shl_13):
    978 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
    979 	lea	-13(%eax), %eax
    980 	movaps	(%eax), %xmm1
    981 	xor	%edi, %edi
    982 	lea	-32(%ecx), %ecx
    983 	movdqu	%xmm0, (%esi)
    984 	POP (%esi)
    985 L(shl_13_loop):
    986 
    987 	movdqa	16(%eax, %edi), %xmm2
    988 	sub	$32, %ecx
    989 	movdqa	32(%eax, %edi), %xmm3
    990 	movdqa	%xmm3, %xmm4
    991 	palignr	$13, %xmm2, %xmm3
    992 	palignr	$13, %xmm1, %xmm2
    993 	lea	32(%edi), %edi
    994 	movdqa	%xmm2, -32(%edx, %edi)
    995 	movdqa	%xmm3, -16(%edx, %edi)
    996 
    997 	jb	L(shl_13_end)
    998 
    999 	movdqa	16(%eax, %edi), %xmm2
   1000 	sub	$32, %ecx
   1001 	movdqa	32(%eax, %edi), %xmm3
   1002 	movdqa	%xmm3, %xmm1
   1003 	palignr	$13, %xmm2, %xmm3
   1004 	palignr	$13, %xmm4, %xmm2
   1005 	lea	32(%edi), %edi
   1006 	movdqa	%xmm2, -32(%edx, %edi)
   1007 	movdqa	%xmm3, -16(%edx, %edi)
   1008 
   1009 	jae	L(shl_13_loop)
   1010 
   1011 L(shl_13_end):
   1012 	lea	32(%ecx), %ecx
   1013 	add	%ecx, %edi
   1014 	add	%edi, %edx
   1015 	lea	13(%edi, %eax), %eax
   1016 	POP (%edi)
   1017 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
   1018 
   1019 	cfi_restore_state
   1020 	cfi_remember_state
   1021 	ALIGN (4)
   1022 L(shl_14):
   1023 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
   1024 	lea	-14(%eax), %eax
   1025 	movaps	(%eax), %xmm1
   1026 	xor	%edi, %edi
   1027 	lea	-32(%ecx), %ecx
   1028 	movdqu	%xmm0, (%esi)
   1029 	POP (%esi)
   1030 L(shl_14_loop):
   1031 
   1032 	movdqa	16(%eax, %edi), %xmm2
   1033 	sub	$32, %ecx
   1034 	movdqa	32(%eax, %edi), %xmm3
   1035 	movdqa	%xmm3, %xmm4
   1036 	palignr	$14, %xmm2, %xmm3
   1037 	palignr	$14, %xmm1, %xmm2
   1038 	lea	32(%edi), %edi
   1039 	movdqa	%xmm2, -32(%edx, %edi)
   1040 	movdqa	%xmm3, -16(%edx, %edi)
   1041 
   1042 	jb	L(shl_14_end)
   1043 
   1044 	movdqa	16(%eax, %edi), %xmm2
   1045 	sub	$32, %ecx
   1046 	movdqa	32(%eax, %edi), %xmm3
   1047 	movdqa	%xmm3, %xmm1
   1048 	palignr	$14, %xmm2, %xmm3
   1049 	palignr	$14, %xmm4, %xmm2
   1050 	lea	32(%edi), %edi
   1051 	movdqa	%xmm2, -32(%edx, %edi)
   1052 	movdqa	%xmm3, -16(%edx, %edi)
   1053 
   1054 	jae	L(shl_14_loop)
   1055 
   1056 L(shl_14_end):
   1057 	lea	32(%ecx), %ecx
   1058 	add	%ecx, %edi
   1059 	add	%edi, %edx
   1060 	lea	14(%edi, %eax), %eax
   1061 	POP (%edi)
   1062 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
   1063 
   1064 	cfi_restore_state
   1065 	cfi_remember_state
   1066 	ALIGN (4)
   1067 L(shl_15):
        	/* Forward copy path that realigns the source stream with
        	   palignr $15.  The aligned loads below (movaps/movdqa) require
        	   %eax - 15 to be 16-byte aligned, and the movdqa stores require
        	   %edx to be 16-byte aligned.  The loop is unrolled twice and
        	   each half moves 32 bytes.
        	   NOTE(review): %xmm0, %esi and %ecx are set up before this
        	   label, and the BRANCH_TO_JMPTBL_* / POP macros are defined
        	   outside this excerpt -- confirm their contracts there.  */
   1068 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
   1069 	lea	-15(%eax), %eax	/* Back the source pointer up by 15 bytes.  */
   1070 	movaps	(%eax), %xmm1	/* xmm1 = first aligned 16-byte source chunk.  */
   1071 	xor	%edi, %edi	/* edi = running byte offset, starts at 0.  */
   1072 	lea	-32(%ecx), %ecx	/* ecx -= 32; lea leaves EFLAGS untouched.  */
   1073 	movdqu	%xmm0, (%esi)	/* Unaligned 16-byte store of xmm0 at (%esi).  */
   1074 	POP (%esi)
   1075 L(shl_15_loop):
   1076 
   1077 	movdqa	16(%eax, %edi), %xmm2
   1078 	sub	$32, %ecx	/* Sets CF for the jb below; nothing in between writes flags.  */
   1079 	movdqa	32(%eax, %edi), %xmm3
   1080 	movdqa	%xmm3, %xmm4	/* Keep the unshifted newest chunk for the second half.  */
   1081 	palignr	$15, %xmm2, %xmm3	/* xmm3 = bytes 15..30 of xmm3:xmm2 (xmm2 is the low half).  */
   1082 	palignr	$15, %xmm1, %xmm2	/* xmm2 = bytes 15..30 of xmm2:xmm1 (xmm1 is the low half).  */
   1083 	lea	32(%edi), %edi	/* Advance the offset by 32 without touching flags.  */
   1084 	movdqa	%xmm2, -32(%edx, %edi)
   1085 	movdqa	%xmm3, -16(%edx, %edi)
   1086 
   1087 	jb	L(shl_15_end)	/* The sub above borrowed: leave the loop.  */
   1088 
        	/* Second unrolled half: xmm4 is the carried-over chunk and xmm1
        	   receives the newest one for the next pass.  */
   1089 	movdqa	16(%eax, %edi), %xmm2
   1090 	sub	$32, %ecx	/* Sets CF for the jae below.  */
   1091 	movdqa	32(%eax, %edi), %xmm3
   1092 	movdqa	%xmm3, %xmm1
   1093 	palignr	$15, %xmm2, %xmm3
   1094 	palignr	$15, %xmm4, %xmm2
   1095 	lea	32(%edi), %edi
   1096 	movdqa	%xmm2, -32(%edx, %edi)
   1097 	movdqa	%xmm3, -16(%edx, %edi)
   1098 
   1099 	jae	L(shl_15_loop)	/* No borrow: go round again.  */
   1100 
   1101 L(shl_15_end):
   1102 	lea	32(%ecx), %ecx	/* ecx += 32; used below as the tail length / table index.  */
   1103 	add	%ecx, %edi	/* edi = bytes moved by the loop + tail length.  */
   1104 	add	%edi, %edx	/* edx = one past the end of the destination tail.  */
   1105 	lea	15(%edi, %eax), %eax	/* eax = one past the end of the source tail (undoes the -15).  */
   1106 	POP (%edi)
        	/* Dispatch on %ecx (scale 4) through table_48bytes_fwd, whose
        	   handlers copy the tail using negative offsets.  */
   1107 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
   1108 
   1109 
   1110 	ALIGN (4)
   1111 L(fwd_write_44bytes):
        	/* Forward tail ladder for lengths 44, 40, ..., 4, 0.  These labels
        	   are targets of table_48bytes_fwd.  %eax (source) and %edx
        	   (destination) are addressed with negative offsets only, so they
        	   point just past the end of the bytes being copied.  Entering at
        	   fwd_write_Nbytes copies the dword at offset -N through %ecx and
        	   falls through to the N-4 case.  */
   1112 	movl	-44(%eax), %ecx
   1113 	movl	%ecx, -44(%edx)
   1114 L(fwd_write_40bytes):
   1115 	movl	-40(%eax), %ecx
   1116 	movl	%ecx, -40(%edx)
   1117 L(fwd_write_36bytes):
   1118 	movl	-36(%eax), %ecx
   1119 	movl	%ecx, -36(%edx)
   1120 L(fwd_write_32bytes):
   1121 	movl	-32(%eax), %ecx
   1122 	movl	%ecx, -32(%edx)
   1123 L(fwd_write_28bytes):
   1124 	movl	-28(%eax), %ecx
   1125 	movl	%ecx, -28(%edx)
   1126 L(fwd_write_24bytes):
   1127 	movl	-24(%eax), %ecx
   1128 	movl	%ecx, -24(%edx)
   1129 L(fwd_write_20bytes):
   1130 	movl	-20(%eax), %ecx
   1131 	movl	%ecx, -20(%edx)
   1132 L(fwd_write_16bytes):
   1133 	movl	-16(%eax), %ecx
   1134 	movl	%ecx, -16(%edx)
   1135 L(fwd_write_12bytes):
   1136 	movl	-12(%eax), %ecx
   1137 	movl	%ecx, -12(%edx)
   1138 L(fwd_write_8bytes):
   1139 	movl	-8(%eax), %ecx
   1140 	movl	%ecx, -8(%edx)
   1141 L(fwd_write_4bytes):
   1142 	movl	-4(%eax), %ecx
   1143 	movl	%ecx, -4(%edx)
   1144 L(fwd_write_0bytes):
        	/* Load the return value into %eax unless built as bcopy: the end
        	   of the destination (%edx) for mempcpy, otherwise DEST(%esp).
        	   NOTE(review): DEST and RETURN are macros defined outside this
        	   excerpt; DEST is presumably the stack offset of the destination
        	   argument -- confirm.  */
   1145 #ifndef USE_AS_BCOPY
   1146 # ifdef USE_AS_MEMPCPY
   1147 	movl	%edx, %eax
   1148 # else
   1149 	movl	DEST(%esp), %eax
   1150 # endif
   1151 #endif
   1152 	RETURN
   1153 
   1154 	ALIGN (4)
   1155 L(fwd_write_5bytes):
        	/* Forward tail of exactly 5 bytes, done as two overlapping dword
        	   copies (offsets -5..-2 and -4..-1).  Both loads are issued
        	   before either store.  %eax is reused as the second temporary
        	   because the source pointer is not needed afterwards.  */
   1156 	movl	-5(%eax), %ecx
   1157 	movl	-4(%eax), %eax
   1158 	movl	%ecx, -5(%edx)
   1159 	movl	%eax, -4(%edx)
        	/* Return value: %edx for mempcpy, DEST(%esp) otherwise, nothing
        	   for bcopy.  DEST and RETURN are macros defined outside this
        	   excerpt.  */
   1160 #ifndef USE_AS_BCOPY
   1161 # ifdef USE_AS_MEMPCPY
   1162 	movl	%edx, %eax
   1163 # else
   1164 	movl	DEST(%esp), %eax
   1165 # endif
   1166 #endif
   1167 	RETURN
   1168 
   1169 	ALIGN (4)
   1170 L(fwd_write_45bytes):
        	/* Forward tail ladder for lengths 45, 41, ..., 13, 9 and 1.  These
        	   labels are targets of table_48bytes_fwd.  %eax (source) and
        	   %edx (destination) point just past the end of the bytes being
        	   copied.  Each rung copies one dword at offset -N through %ecx
        	   and falls through.  There is no fwd_write_5bytes rung here: the
        	   9-byte rung copies the dwords at -9 and -5 itself and then
        	   falls into the 1-byte case.  */
   1171 	movl	-45(%eax), %ecx
   1172 	movl	%ecx, -45(%edx)
   1173 L(fwd_write_41bytes):
   1174 	movl	-41(%eax), %ecx
   1175 	movl	%ecx, -41(%edx)
   1176 L(fwd_write_37bytes):
   1177 	movl	-37(%eax), %ecx
   1178 	movl	%ecx, -37(%edx)
   1179 L(fwd_write_33bytes):
   1180 	movl	-33(%eax), %ecx
   1181 	movl	%ecx, -33(%edx)
   1182 L(fwd_write_29bytes):
   1183 	movl	-29(%eax), %ecx
   1184 	movl	%ecx, -29(%edx)
   1185 L(fwd_write_25bytes):
   1186 	movl	-25(%eax), %ecx
   1187 	movl	%ecx, -25(%edx)
   1188 L(fwd_write_21bytes):
   1189 	movl	-21(%eax), %ecx
   1190 	movl	%ecx, -21(%edx)
   1191 L(fwd_write_17bytes):
   1192 	movl	-17(%eax), %ecx
   1193 	movl	%ecx, -17(%edx)
   1194 L(fwd_write_13bytes):
   1195 	movl	-13(%eax), %ecx
   1196 	movl	%ecx, -13(%edx)
   1197 L(fwd_write_9bytes):
   1198 	movl	-9(%eax), %ecx
   1199 	movl	%ecx, -9(%edx)
   1200 	movl	-5(%eax), %ecx
   1201 	movl	%ecx, -5(%edx)
   1202 L(fwd_write_1bytes):
   1203 	movzbl	-1(%eax), %ecx	/* Last byte, zero-extended into ecx.  */
   1204 	movb	%cl, -1(%edx)
        	/* Return value: %edx for mempcpy, DEST(%esp) otherwise, nothing
        	   for bcopy.  DEST and RETURN are macros defined outside this
        	   excerpt.  */
   1205 #ifndef USE_AS_BCOPY
   1206 # ifdef USE_AS_MEMPCPY
   1207 	movl	%edx, %eax
   1208 # else
   1209 	movl	DEST(%esp), %eax
   1210 # endif
   1211 #endif
   1212 	RETURN
   1213 
   1214 	ALIGN (4)
   1215 L(fwd_write_46bytes):
        	/* Forward tail ladder for lengths 46, 42, ..., 6, 2.  These labels
        	   are targets of table_48bytes_fwd.  %eax (source) and %edx
        	   (destination) point just past the end of the bytes being
        	   copied.  Each rung copies one dword at offset -N through %ecx
        	   and falls through; the final rung copies the last 16-bit
        	   word.  */
   1216 	movl	-46(%eax), %ecx
   1217 	movl	%ecx, -46(%edx)
   1218 L(fwd_write_42bytes):
   1219 	movl	-42(%eax), %ecx
   1220 	movl	%ecx, -42(%edx)
   1221 L(fwd_write_38bytes):
   1222 	movl	-38(%eax), %ecx
   1223 	movl	%ecx, -38(%edx)
   1224 L(fwd_write_34bytes):
   1225 	movl	-34(%eax), %ecx
   1226 	movl	%ecx, -34(%edx)
   1227 L(fwd_write_30bytes):
   1228 	movl	-30(%eax), %ecx
   1229 	movl	%ecx, -30(%edx)
   1230 L(fwd_write_26bytes):
   1231 	movl	-26(%eax), %ecx
   1232 	movl	%ecx, -26(%edx)
   1233 L(fwd_write_22bytes):
   1234 	movl	-22(%eax), %ecx
   1235 	movl	%ecx, -22(%edx)
   1236 L(fwd_write_18bytes):
   1237 	movl	-18(%eax), %ecx
   1238 	movl	%ecx, -18(%edx)
   1239 L(fwd_write_14bytes):
   1240 	movl	-14(%eax), %ecx
   1241 	movl	%ecx, -14(%edx)
   1242 L(fwd_write_10bytes):
   1243 	movl	-10(%eax), %ecx
   1244 	movl	%ecx, -10(%edx)
   1245 L(fwd_write_6bytes):
   1246 	movl	-6(%eax), %ecx
   1247 	movl	%ecx, -6(%edx)
   1248 L(fwd_write_2bytes):
   1249 	movzwl	-2(%eax), %ecx	/* Last two bytes, zero-extended into ecx.  */
   1250 	movw	%cx, -2(%edx)
        	/* Return value: %edx for mempcpy, DEST(%esp) otherwise, nothing
        	   for bcopy.  DEST and RETURN are macros defined outside this
        	   excerpt.  */
   1251 #ifndef USE_AS_BCOPY
   1252 # ifdef USE_AS_MEMPCPY
   1253 	movl	%edx, %eax
   1254 # else
   1255 	movl	DEST(%esp), %eax
   1256 # endif
   1257 #endif
   1258 	RETURN
   1259 
   1260 	ALIGN (4)
   1261 L(fwd_write_47bytes):
        	/* Forward tail ladder for lengths 47, 43, ..., 7, 3.  These labels
        	   are targets of table_48bytes_fwd.  %eax (source) and %edx
        	   (destination) point just past the end of the bytes being
        	   copied.  Each rung copies one dword at offset -N through %ecx
        	   and falls through; the final rung copies a 16-bit word at -3
        	   and a byte at -1.  */
   1262 	movl	-47(%eax), %ecx
   1263 	movl	%ecx, -47(%edx)
   1264 L(fwd_write_43bytes):
   1265 	movl	-43(%eax), %ecx
   1266 	movl	%ecx, -43(%edx)
   1267 L(fwd_write_39bytes):
   1268 	movl	-39(%eax), %ecx
   1269 	movl	%ecx, -39(%edx)
   1270 L(fwd_write_35bytes):
   1271 	movl	-35(%eax), %ecx
   1272 	movl	%ecx, -35(%edx)
   1273 L(fwd_write_31bytes):
   1274 	movl	-31(%eax), %ecx
   1275 	movl	%ecx, -31(%edx)
   1276 L(fwd_write_27bytes):
   1277 	movl	-27(%eax), %ecx
   1278 	movl	%ecx, -27(%edx)
   1279 L(fwd_write_23bytes):
   1280 	movl	-23(%eax), %ecx
   1281 	movl	%ecx, -23(%edx)
   1282 L(fwd_write_19bytes):
   1283 	movl	-19(%eax), %ecx
   1284 	movl	%ecx, -19(%edx)
   1285 L(fwd_write_15bytes):
   1286 	movl	-15(%eax), %ecx
   1287 	movl	%ecx, -15(%edx)
   1288 L(fwd_write_11bytes):
   1289 	movl	-11(%eax), %ecx
   1290 	movl	%ecx, -11(%edx)
   1291 L(fwd_write_7bytes):
   1292 	movl	-7(%eax), %ecx
   1293 	movl	%ecx, -7(%edx)
   1294 L(fwd_write_3bytes):
   1295 	movzwl	-3(%eax), %ecx	/* Bytes -3..-2 into ecx.  */
   1296 	movzbl	-1(%eax), %eax	/* Byte -1 into eax; the source pointer is dead after this.  */
   1297 	movw	%cx, -3(%edx)
   1298 	movb	%al, -1(%edx)
        	/* Return value: %edx for mempcpy, DEST(%esp) otherwise, nothing
        	   for bcopy.  DEST and RETURN_END are macros defined outside this
        	   excerpt.  */
   1299 #ifndef USE_AS_BCOPY
   1300 # ifdef USE_AS_MEMPCPY
   1301 	movl	%edx, %eax
   1302 # else
   1303 	movl	DEST(%esp), %eax
   1304 # endif
   1305 #endif
   1306 	RETURN_END
   1307 
   1308 	cfi_restore_state
   1309 	cfi_remember_state
   1310 	ALIGN (4)
   1311 L(large_page):
        	/* Forward copy path that loads with movdqu (no source alignment
        	   needed) and stores with movntdq, a non-temporal store that
        	   requires %edx to be 16-byte aligned.  The main loop moves
        	   0x80 bytes per pass, then 64- and 32-byte steps run before the
        	   tail is dispatched through table_48bytes_fwd.
        	   NOTE(review): %xmm0, %esi and %ecx are set up before this
        	   label, and the POP / BRANCH_TO_JMPTBL_ENTRY macros are defined
        	   outside this excerpt -- confirm their contracts there.  */
   1312 	movdqu	(%eax), %xmm1
   1313 	lea	16(%eax), %eax
   1314 	movdqu	%xmm0, (%esi)	/* Unaligned 16-byte store of xmm0 at (%esi).  */
   1315 	movntdq	%xmm1, (%edx)
   1316 	lea	16(%edx), %edx
   1317 	POP (%esi)
   1318 	lea	-0x90(%ecx), %ecx	/* ecx -= 0x90 (= 16 + 0x80); presumably the 16 bytes just stored plus one loop pass -- confirm.  */
   1319 	POP (%edi)
   1320 L(large_page_loop):
   1321 	movdqu	(%eax), %xmm0
   1322 	movdqu	0x10(%eax), %xmm1
   1323 	movdqu	0x20(%eax), %xmm2
   1324 	movdqu	0x30(%eax), %xmm3
   1325 	movdqu	0x40(%eax), %xmm4
   1326 	movdqu	0x50(%eax), %xmm5
   1327 	movdqu	0x60(%eax), %xmm6
   1328 	movdqu	0x70(%eax), %xmm7
   1329 	lea	0x80(%eax), %eax
   1330 
   1331 	sub	$0x80, %ecx	/* Sets CF for the jae below; movntdq and lea do not write flags.  */
   1332 	movntdq	%xmm0, (%edx)
   1333 	movntdq	%xmm1, 0x10(%edx)
   1334 	movntdq	%xmm2, 0x20(%edx)
   1335 	movntdq	%xmm3, 0x30(%edx)
   1336 	movntdq	%xmm4, 0x40(%edx)
   1337 	movntdq	%xmm5, 0x50(%edx)
   1338 	movntdq	%xmm6, 0x60(%edx)
   1339 	movntdq	%xmm7, 0x70(%edx)
   1340 	lea	0x80(%edx), %edx
   1341 	jae	L(large_page_loop)
        	/* Signed compare of the (now negative) ecx against -0x40, taken
        	   before the lea adds 0x80 back.  The lea preserves the flags, so
        	   the jl fires when fewer than 0x40 bytes remain.  */
   1342 	cmp	$-0x40, %ecx
   1343 	lea	0x80(%ecx), %ecx
   1344 	jl	L(large_page_less_64bytes)
   1345 
        	/* At least 64 bytes remain: move one 64-byte step.  */
   1346 	movdqu	(%eax), %xmm0
   1347 	movdqu	0x10(%eax), %xmm1
   1348 	movdqu	0x20(%eax), %xmm2
   1349 	movdqu	0x30(%eax), %xmm3
   1350 	lea	0x40(%eax), %eax
   1351 
   1352 	movntdq	%xmm0, (%edx)
   1353 	movntdq	%xmm1, 0x10(%edx)
   1354 	movntdq	%xmm2, 0x20(%edx)
   1355 	movntdq	%xmm3, 0x30(%edx)
   1356 	lea	0x40(%edx), %edx
   1357 	sub	$0x40, %ecx
   1358 L(large_page_less_64bytes):
   1359 	cmp	$32, %ecx
   1360 	jb	L(large_page_less_32bytes)
        	/* At least 32 bytes remain: move one 32-byte step.  */
   1361 	movdqu	(%eax), %xmm0
   1362 	movdqu	0x10(%eax), %xmm1
   1363 	lea	0x20(%eax), %eax
   1364 	movntdq	%xmm0, (%edx)
   1365 	movntdq	%xmm1, 0x10(%edx)
   1366 	lea	0x20(%edx), %edx
   1367 	sub	$0x20, %ecx
   1368 L(large_page_less_32bytes):
   1369 	add	%ecx, %edx	/* edx = one past the end of the destination tail.  */
   1370 	add	%ecx, %eax	/* eax = one past the end of the source tail.  */
   1371 	sfence	/* Order the non-temporal stores above before any later stores.  */
        	/* Dispatch on %ecx (scale 4) through table_48bytes_fwd, whose
        	   handlers copy the tail using negative offsets.  */
   1372 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
   1373 
   1374 
   1375 	ALIGN (4)
   1376 L(bk_write_44bytes):
        	/* Backward tail ladder for lengths 44, 40, ..., 4, 0.  These
        	   labels are targets of table_48_bytes_bwd.  %eax (source) and
        	   %edx (destination) are addressed with non-negative offsets, so
        	   they point at the first of the bytes being copied.  Entering at
        	   bk_write_Nbytes copies the dword at offset N-4 through %ecx and
        	   falls through to the N-4 case, so the highest addresses are
        	   written first.  */
   1377 	movl	40(%eax), %ecx
   1378 	movl	%ecx, 40(%edx)
   1379 L(bk_write_40bytes):
   1380 	movl	36(%eax), %ecx
   1381 	movl	%ecx, 36(%edx)
   1382 L(bk_write_36bytes):
   1383 	movl	32(%eax), %ecx
   1384 	movl	%ecx, 32(%edx)
   1385 L(bk_write_32bytes):
   1386 	movl	28(%eax), %ecx
   1387 	movl	%ecx, 28(%edx)
   1388 L(bk_write_28bytes):
   1389 	movl	24(%eax), %ecx
   1390 	movl	%ecx, 24(%edx)
   1391 L(bk_write_24bytes):
   1392 	movl	20(%eax), %ecx
   1393 	movl	%ecx, 20(%edx)
   1394 L(bk_write_20bytes):
   1395 	movl	16(%eax), %ecx
   1396 	movl	%ecx, 16(%edx)
   1397 L(bk_write_16bytes):
   1398 	movl	12(%eax), %ecx
   1399 	movl	%ecx, 12(%edx)
   1400 L(bk_write_12bytes):
   1401 	movl	8(%eax), %ecx
   1402 	movl	%ecx, 8(%edx)
   1403 L(bk_write_8bytes):
   1404 	movl	4(%eax), %ecx
   1405 	movl	%ecx, 4(%edx)
   1406 L(bk_write_4bytes):
   1407 	movl	(%eax), %ecx
   1408 	movl	%ecx, (%edx)
   1409 L(bk_write_0bytes):
        	/* Return value unless built as bcopy: DEST(%esp), plus LEN(%esp)
        	   when built as mempcpy.
        	   NOTE(review): DEST, LEN and RETURN are macros defined outside
        	   this excerpt; presumably the stack offsets of the destination
        	   and length arguments -- confirm.  */
   1410 #ifndef USE_AS_BCOPY
   1411 	movl	DEST(%esp), %eax
   1412 # ifdef USE_AS_MEMPCPY
   1413 	movl	LEN(%esp), %ecx
   1414 	add	%ecx, %eax
   1415 # endif
   1416 #endif
   1417 	RETURN
   1418 
   1419 	ALIGN (4)
   1420 L(bk_write_45bytes):
        	/* Backward tail ladder for lengths 45, 41, ..., 5, 1.  These
        	   labels are targets of table_48_bytes_bwd.  %eax (source) and
        	   %edx (destination) point at the first of the bytes being
        	   copied.  Each rung copies the dword at offset N-4 through %ecx
        	   and falls through; the final rung copies the single byte at
        	   offset 0.  */
   1421 	movl	41(%eax), %ecx
   1422 	movl	%ecx, 41(%edx)
   1423 L(bk_write_41bytes):
   1424 	movl	37(%eax), %ecx
   1425 	movl	%ecx, 37(%edx)
   1426 L(bk_write_37bytes):
   1427 	movl	33(%eax), %ecx
   1428 	movl	%ecx, 33(%edx)
   1429 L(bk_write_33bytes):
   1430 	movl	29(%eax), %ecx
   1431 	movl	%ecx, 29(%edx)
   1432 L(bk_write_29bytes):
   1433 	movl	25(%eax), %ecx
   1434 	movl	%ecx, 25(%edx)
   1435 L(bk_write_25bytes):
   1436 	movl	21(%eax), %ecx
   1437 	movl	%ecx, 21(%edx)
   1438 L(bk_write_21bytes):
   1439 	movl	17(%eax), %ecx
   1440 	movl	%ecx, 17(%edx)
   1441 L(bk_write_17bytes):
   1442 	movl	13(%eax), %ecx
   1443 	movl	%ecx, 13(%edx)
   1444 L(bk_write_13bytes):
   1445 	movl	9(%eax), %ecx
   1446 	movl	%ecx, 9(%edx)
   1447 L(bk_write_9bytes):
   1448 	movl	5(%eax), %ecx
   1449 	movl	%ecx, 5(%edx)
   1450 L(bk_write_5bytes):
   1451 	movl	1(%eax), %ecx
   1452 	movl	%ecx, 1(%edx)
   1453 L(bk_write_1bytes):
   1454 	movzbl	(%eax), %ecx	/* First byte, zero-extended into ecx.  */
   1455 	movb	%cl, (%edx)
        	/* Return value unless built as bcopy: DEST(%esp), plus LEN(%esp)
        	   when built as mempcpy.  DEST, LEN and RETURN are macros defined
        	   outside this excerpt.  */
   1456 #ifndef USE_AS_BCOPY
   1457 	movl	DEST(%esp), %eax
   1458 # ifdef USE_AS_MEMPCPY
   1459 	movl	LEN(%esp), %ecx
   1460 	add	%ecx, %eax
   1461 # endif
   1462 #endif
   1463 	RETURN
   1464 
   1465 	ALIGN (4)
   1466 L(bk_write_46bytes):
        	/* Backward tail ladder for lengths 46, 42, ..., 6, 2.  These
        	   labels are targets of table_48_bytes_bwd.  %eax (source) and
        	   %edx (destination) point at the first of the bytes being
        	   copied.  Each rung copies the dword at offset N-4 through %ecx
        	   and falls through; the final rung copies the 16-bit word at
        	   offset 0.  */
   1467 	movl	42(%eax), %ecx
   1468 	movl	%ecx, 42(%edx)
   1469 L(bk_write_42bytes):
   1470 	movl	38(%eax), %ecx
   1471 	movl	%ecx, 38(%edx)
   1472 L(bk_write_38bytes):
   1473 	movl	34(%eax), %ecx
   1474 	movl	%ecx, 34(%edx)
   1475 L(bk_write_34bytes):
   1476 	movl	30(%eax), %ecx
   1477 	movl	%ecx, 30(%edx)
   1478 L(bk_write_30bytes):
   1479 	movl	26(%eax), %ecx
   1480 	movl	%ecx, 26(%edx)
   1481 L(bk_write_26bytes):
   1482 	movl	22(%eax), %ecx
   1483 	movl	%ecx, 22(%edx)
   1484 L(bk_write_22bytes):
   1485 	movl	18(%eax), %ecx
   1486 	movl	%ecx, 18(%edx)
   1487 L(bk_write_18bytes):
   1488 	movl	14(%eax), %ecx
   1489 	movl	%ecx, 14(%edx)
   1490 L(bk_write_14bytes):
   1491 	movl	10(%eax), %ecx
   1492 	movl	%ecx, 10(%edx)
   1493 L(bk_write_10bytes):
   1494 	movl	6(%eax), %ecx
   1495 	movl	%ecx, 6(%edx)
   1496 L(bk_write_6bytes):
   1497 	movl	2(%eax), %ecx
   1498 	movl	%ecx, 2(%edx)
   1499 L(bk_write_2bytes):
   1500 	movzwl	(%eax), %ecx	/* First two bytes, zero-extended into ecx.  */
   1501 	movw	%cx, (%edx)
        	/* Return value unless built as bcopy: DEST(%esp), plus LEN(%esp)
        	   when built as mempcpy.  DEST, LEN and RETURN are macros defined
        	   outside this excerpt.  */
   1502 #ifndef USE_AS_BCOPY
   1503 	movl	DEST(%esp), %eax
   1504 # ifdef USE_AS_MEMPCPY
   1505 	movl	LEN(%esp), %ecx
   1506 	add	%ecx, %eax
   1507 # endif
   1508 #endif
   1509 	RETURN
   1510 
   1511 	ALIGN (4)
   1512 L(bk_write_47bytes):
        	/* Backward tail ladder for lengths 47, 43, ..., 7, 3.  These
        	   labels are targets of table_48_bytes_bwd.  %eax (source) and
        	   %edx (destination) point at the first of the bytes being
        	   copied.  Each rung copies the dword at offset N-4 through %ecx
        	   and falls through; the final rung copies a 16-bit word at
        	   offset 1 and then the byte at offset 0.  */
   1513 	movl	43(%eax), %ecx
   1514 	movl	%ecx, 43(%edx)
   1515 L(bk_write_43bytes):
   1516 	movl	39(%eax), %ecx
   1517 	movl	%ecx, 39(%edx)
   1518 L(bk_write_39bytes):
   1519 	movl	35(%eax), %ecx
   1520 	movl	%ecx, 35(%edx)
   1521 L(bk_write_35bytes):
   1522 	movl	31(%eax), %ecx
   1523 	movl	%ecx, 31(%edx)
   1524 L(bk_write_31bytes):
   1525 	movl	27(%eax), %ecx
   1526 	movl	%ecx, 27(%edx)
   1527 L(bk_write_27bytes):
   1528 	movl	23(%eax), %ecx
   1529 	movl	%ecx, 23(%edx)
   1530 L(bk_write_23bytes):
   1531 	movl	19(%eax), %ecx
   1532 	movl	%ecx, 19(%edx)
   1533 L(bk_write_19bytes):
   1534 	movl	15(%eax), %ecx
   1535 	movl	%ecx, 15(%edx)
   1536 L(bk_write_15bytes):
   1537 	movl	11(%eax), %ecx
   1538 	movl	%ecx, 11(%edx)
   1539 L(bk_write_11bytes):
   1540 	movl	7(%eax), %ecx
   1541 	movl	%ecx, 7(%edx)
   1542 L(bk_write_7bytes):
   1543 	movl	3(%eax), %ecx
   1544 	movl	%ecx, 3(%edx)
   1545 L(bk_write_3bytes):
   1546 	movzwl	1(%eax), %ecx	/* Bytes 1..2 into ecx.  */
   1547 	movw	%cx, 1(%edx)
   1548 	movzbl	(%eax), %eax	/* Byte 0 into eax; the source pointer is dead after this.  */
   1549 	movb	%al, (%edx)
        	/* Return value unless built as bcopy: DEST(%esp), plus LEN(%esp)
        	   when built as mempcpy.  DEST, LEN and RETURN_END are macros
        	   defined outside this excerpt.  */
   1550 #ifndef USE_AS_BCOPY
   1551 	movl	DEST(%esp), %eax
   1552 # ifdef USE_AS_MEMPCPY
   1553 	movl	LEN(%esp), %ecx
   1554 	add	%ecx, %eax
   1555 # endif
   1556 #endif
   1557 	RETURN_END
   1558 
   1559 
   1560 	.pushsection .rodata.ssse3,"a",@progbits
   1561 	ALIGN (2)
   1562 L(table_48bytes_fwd):
        	/* Dispatch table for the forward tail copy, placed in the
        	   .rodata.ssse3 section.  It has 48 entries of one .int each;
        	   entry N refers to L(fwd_write_Nbytes) for N = 0..47.
        	   NOTE(review): JMPTBL is defined outside this excerpt;
        	   presumably it encodes the handler address relative to the
        	   table base -- confirm.  */
   1563 	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
   1564 	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
   1565 	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
   1566 	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
   1567 	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
   1568 	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
   1569 	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
   1570 	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
   1571 	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
   1572 	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
   1573 	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
   1574 	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
   1575 	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
   1576 	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
   1577 	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
   1578 	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
   1579 	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
   1580 	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
   1581 	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
   1582 	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
   1583 	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
   1584 	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
   1585 	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
   1586 	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
   1587 	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
   1588 	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
   1589 	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
   1590 	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
   1591 	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
   1592 	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
   1593 	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
   1594 	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
   1595 	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
   1596 	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
   1597 	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
   1598 	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
   1599 	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
   1600 	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
   1601 	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
   1602 	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
   1603 	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
   1604 	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
   1605 	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
   1606 	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
   1607 	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
   1608 	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
   1609 	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
   1610 	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
   1611 
   1612 	ALIGN (2)
   1613 L(shl_table):
        	/* Dispatch table for the shift-specific forward copy paths.  It
        	   has 16 entries of one .int each; entry N refers to L(shl_N)
        	   for N = 0..15.
        	   NOTE(review): the code that indexes this table and the JMPTBL
        	   macro are outside this excerpt; the index is presumably the
        	   source misalignment within 16 bytes -- confirm.  */
   1614 	.int	JMPTBL (L(shl_0), L(shl_table))
   1615 	.int	JMPTBL (L(shl_1), L(shl_table))
   1616 	.int	JMPTBL (L(shl_2), L(shl_table))
   1617 	.int	JMPTBL (L(shl_3), L(shl_table))
   1618 	.int	JMPTBL (L(shl_4), L(shl_table))
   1619 	.int	JMPTBL (L(shl_5), L(shl_table))
   1620 	.int	JMPTBL (L(shl_6), L(shl_table))
   1621 	.int	JMPTBL (L(shl_7), L(shl_table))
   1622 	.int	JMPTBL (L(shl_8), L(shl_table))
   1623 	.int	JMPTBL (L(shl_9), L(shl_table))
   1624 	.int	JMPTBL (L(shl_10), L(shl_table))
   1625 	.int	JMPTBL (L(shl_11), L(shl_table))
   1626 	.int	JMPTBL (L(shl_12), L(shl_table))
   1627 	.int	JMPTBL (L(shl_13), L(shl_table))
   1628 	.int	JMPTBL (L(shl_14), L(shl_table))
   1629 	.int	JMPTBL (L(shl_15), L(shl_table))
   1630 
   1631 	ALIGN (2)
   1632 L(table_48_bytes_bwd):
        	/* Dispatch table for the backward tail copy.  It has 48 entries
        	   of one .int each; entry N refers to L(bk_write_Nbytes) for
        	   N = 0..47.  The .popsection after the last entry leaves the
        	   read-only data section.
        	   NOTE(review): JMPTBL is defined outside this excerpt;
        	   presumably it encodes the handler address relative to the
        	   table base -- confirm.  */
   1633 	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
   1634 	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
   1635 	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
   1636 	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
   1637 	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
   1638 	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
   1639 	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
   1640 	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
   1641 	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
   1642 	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
   1643 	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
   1644 	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
   1645 	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
   1646 	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
   1647 	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
   1648 	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
   1649 	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
   1650 	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
   1651 	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
   1652 	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
   1653 	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
   1654 	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
   1655 	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
   1656 	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
   1657 	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
   1658 	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
   1659 	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
   1660 	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
   1661 	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
   1662 	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
   1663 	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
   1664 	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
   1665 	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
   1666 	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
   1667 	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
   1668 	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
   1669 	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
   1670 	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
   1671 	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
   1672 	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
   1673 	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
   1674 	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
   1675 	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
   1676 	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
   1677 	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
   1678 	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
   1679 	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
   1680 	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
   1681 
   1682 	.popsection
   1683 
   1684 #ifdef USE_AS_MEMMOVE
   1685 	ALIGN (4)
   1686 L(copy_backward):
        	/* Backward (high address to low address) copy, assembled only
        	   when USE_AS_MEMMOVE is defined.  After the setup below, %esi
        	   (source) and %edx (destination) point one past the end of the
        	   region and %ecx holds the bytes still to copy.  All copies in
        	   this path read from negative offsets of %esi and write to the
        	   same offsets of %edx.
        	   NOTE(review): the entry state is set outside this excerpt;
        	   presumably %eax = source, %edx = destination, %ecx = length --
        	   confirm at the jump site.  PUSH / POP and
        	   BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere.  */
   1687 	PUSH (%esi)
   1688 	movl	%eax, %esi
   1689 	lea	(%ecx,%edx,1),%edx	/* edx += ecx.  */
   1690 	lea	(%ecx,%esi,1),%esi	/* esi += ecx.  */
   1691 	testl	$0x3, %edx	/* Is the destination end 4-byte aligned?  */
   1692 	jnz	L(bk_align)
   1693 
   1694 L(bk_aligned_4):
   1695 	cmp	$64, %ecx
   1696 	jae	L(bk_write_more64bytes)
   1697 
   1698 L(bk_write_64bytesless):
   1699 	cmp	$32, %ecx
   1700 	jb	L(bk_write_less32bytes)
   1701 
   1702 L(bk_write_more32bytes):
   1703 	/* Copy 32 bytes at a time.  */
   1704 	sub	$32, %ecx
   1705 	movl	-4(%esi), %eax
   1706 	movl	%eax, -4(%edx)
   1707 	movl	-8(%esi), %eax
   1708 	movl	%eax, -8(%edx)
   1709 	movl	-12(%esi), %eax
   1710 	movl	%eax, -12(%edx)
   1711 	movl	-16(%esi), %eax
   1712 	movl	%eax, -16(%edx)
   1713 	movl	-20(%esi), %eax
   1714 	movl	%eax, -20(%edx)
   1715 	movl	-24(%esi), %eax
   1716 	movl	%eax, -24(%edx)
   1717 	movl	-28(%esi), %eax
   1718 	movl	%eax, -28(%edx)
   1719 	movl	-32(%esi), %eax
   1720 	movl	%eax, -32(%edx)
   1721 	sub	$32, %edx
   1722 	sub	$32, %esi
   1723 
   1724 L(bk_write_less32bytes):
        	/* Rebase both pointers to the start of the remaining %ecx bytes.
        	   The bk_write_* handlers use non-negative offsets from %eax
        	   (source) and %edx (destination).  */
   1725 	movl	%esi, %eax
   1726 	sub	%ecx, %edx
   1727 	sub	%ecx, %eax
   1728 	POP (%esi)
   1729 L(bk_write_less32bytes_2):
        	/* Dispatch on %ecx (scale 4) through table_48_bytes_bwd.  */
   1730 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
   1731 
   1732 	CFI_PUSH (%esi)
   1733 	ALIGN (4)
   1734 L(bk_align):
        	/* Bring the destination end pointer %edx down to a 4-byte
        	   boundary for the backward copy by copying one byte and/or one
        	   16-bit word, then rejoin bk_aligned_4.  When at most 8 bytes
        	   remain, skip the alignment and go straight to the tail copy.
        	   %esi is the source end pointer and %ecx the remaining length.
        	   NOTE(review): CFI_PUSH is a macro defined outside this
        	   excerpt; presumably it records that %esi is still pushed on
        	   this path -- confirm.  */
   1735 	cmp	$8, %ecx
   1736 	jbe	L(bk_write_less32bytes)
   1737 	testl	$1, %edx
   1738 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
   1739 	   then (EDX & 2) must be != 0.  */
   1740 	jz	L(bk_got2)
        	/* Destination end is odd: copy one byte.  */
   1741 	sub	$1, %esi
   1742 	sub	$1, %ecx
   1743 	sub	$1, %edx
   1744 	movzbl	(%esi), %eax
   1745 	movb	%al, (%edx)
   1746 
   1747 	testl	$2, %edx
   1748 	jz	L(bk_aligned_4)
   1749 
   1750 L(bk_got2):
        	/* Bit 1 of the destination end is set: copy one 16-bit word.  */
   1751 	sub	$2, %esi
   1752 	sub	$2, %ecx
   1753 	sub	$2, %edx
   1754 	movzwl	(%esi), %eax
   1755 	movw	%ax, (%edx)
   1756 	jmp	L(bk_aligned_4)
   1757 
   1758 	ALIGN (4)
   1759 L(bk_write_more64bytes):
        	/* Backward copy for 64 or more remaining bytes.  First lower the
        	   destination end pointer %edx to a 16-byte boundary with up to
        	   three dword copies, then move 64 bytes per pass using unaligned
        	   loads (movdqu) from %esi and aligned stores (movdqa) to %edx.
        	   %ecx is the remaining length throughout.  */
   1760 	/* Check alignment of last byte.  */
   1761 	testl	$15, %edx
   1762 	jz	L(bk_ssse3_cpy_pre)
   1763 
   1764 /* EDX is aligned 4 bytes, but not 16 bytes.  */
   1765 L(bk_ssse3_align):
   1766 	sub	$4, %esi
   1767 	sub	$4, %ecx
   1768 	sub	$4, %edx
   1769 	movl	(%esi), %eax
   1770 	movl	%eax, (%edx)
   1771 
   1772 	testl	$15, %edx
   1773 	jz	L(bk_ssse3_cpy_pre)
   1774 
   1775 	sub	$4, %esi
   1776 	sub	$4, %ecx
   1777 	sub	$4, %edx
   1778 	movl	(%esi), %eax
   1779 	movl	%eax, (%edx)
   1780 
   1781 	testl	$15, %edx
   1782 	jz	L(bk_ssse3_cpy_pre)
   1783 
        	/* Third dword step: no further test follows, because a 4-byte
        	   aligned pointer is at most three dwords above a 16-byte
        	   boundary.  */
   1784 	sub	$4, %esi
   1785 	sub	$4, %ecx
   1786 	sub	$4, %edx
   1787 	movl	(%esi), %eax
   1788 	movl	%eax, (%edx)
   1789 
   1790 L(bk_ssse3_cpy_pre):
        	/* The alignment steps may have dropped the count below 64.  */
   1791 	cmp	$64, %ecx
   1792 	jb	L(bk_write_more32bytes)
   1793 
   1794 L(bk_ssse3_cpy):
        	/* Step both pointers down 64 bytes, then copy the four 16-byte
        	   chunks from the highest address to the lowest.  */
   1795 	sub	$64, %esi
   1796 	sub	$64, %ecx
   1797 	sub	$64, %edx
   1798 	movdqu	0x30(%esi), %xmm3
   1799 	movdqa	%xmm3, 0x30(%edx)
   1800 	movdqu	0x20(%esi), %xmm2
   1801 	movdqa	%xmm2, 0x20(%edx)
   1802 	movdqu	0x10(%esi), %xmm1
   1803 	movdqa	%xmm1, 0x10(%edx)
   1804 	movdqu	(%esi), %xmm0
   1805 	movdqa	%xmm0, (%edx)
   1806 	cmp	$64, %ecx
   1807 	jae	L(bk_ssse3_cpy)
   1808 	jmp	L(bk_write_64bytesless)
   1809 
   1810 #endif
   1811 
   1812 END (MEMCPY)
   1813