Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #include "cache.h"
     32 #undef __i686
     33 
     34 #ifndef L
        /* L(label): name of a file-local label, formed by pasting `label' after
           the `.L' prefix.  Like every helper in this group, it is defined only
           if no definition exists yet.  */
     35 # define L(label)	.L##label
     36 #endif
     37 
     38 #ifndef ALIGN
        /* ALIGN(n): emit `.p2align n' (n is a power-of-two exponent).  */
     39 # define ALIGN(n)	.p2align n
     40 #endif
     41 
     42 #ifndef cfi_startproc
        /* The cfi_* macros that follow expand to the assembler's `.cfi_*'
           directive of the same name, passing their arguments through.  */
     43 # define cfi_startproc			.cfi_startproc
     44 #endif
     45 
     46 #ifndef cfi_endproc
     47 # define cfi_endproc			.cfi_endproc
     48 #endif
     49 
     50 #ifndef cfi_rel_offset
     51 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     52 #endif
     53 
     54 #ifndef cfi_restore
     55 # define cfi_restore(reg)		.cfi_restore reg
     56 #endif
     57 
     58 #ifndef cfi_adjust_cfa_offset
     59 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     60 #endif
     61 
     62 #ifndef ENTRY
        /* ENTRY(name): open the global function `name': mark the symbol as a
           function, export it, align with `.p2align 4', place the label and
           start the CFI region.  */
     63 # define ENTRY(name)			\
     64 	.type name,  @function; 	\
     65 	.globl name;			\
     66 	.p2align 4;			\
     67 name:					\
     68 	cfi_startproc
     69 #endif
     70 
     71 #ifndef END
        /* END(name): close the CFI region opened by ENTRY and record the size
           of `name' as the distance from its label to this point.  */
     72 # define END(name)			\
     73 	cfi_endproc;			\
     74 	.size name, .-name
     75 #endif
     76 
        /* CFI_PUSH(REG): unwind-info note for a 4-byte push of REG.  The CFA
           offset grows by 4 and REG is recorded with `.cfi_rel_offset REG, 0'.
           Emits no instruction.  */
     77 #define CFI_PUSH(REG)						\
     78   cfi_adjust_cfa_offset (4);					\
     79   cfi_rel_offset (REG, 0)
     80 
        /* CFI_POP(REG): the inverse note.  The CFA offset shrinks by 4 and the
           save rule for REG is dropped with `.cfi_restore'.  Emits no
           instruction.  */
     81 #define CFI_POP(REG)						\
     82   cfi_adjust_cfa_offset (-4);					\
     83   cfi_restore (REG)
     84 
        /* PUSH/POP: the real pushl/popl followed by the matching CFI note.  */
     85 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     86 #define POP(REG)	popl REG; CFI_POP (REG)
     87 
        /* Stack offsets of the arguments; the code uses them as OFFSET(%esp).
           PARMS, the offset of the first argument, is defined further down
           because it depends on whether ENTRANCE pushes %ebx.

           With USE_AS_BZERO there are two arguments (DEST, LEN) and SETRTNVAL
           expands to nothing.  Otherwise there are three consecutive 4-byte
           slots (DEST, CHR, LEN) and SETRTNVAL reloads DEST into %eax; it is
           placed immediately before every return.  */
     88 #ifdef USE_AS_BZERO
     89 # define DEST		PARMS
     90 # define LEN		DEST+4
     91 # define SETRTNVAL
     92 #else
     93 # define DEST		PARMS
     94 # define CHR		DEST+4
     95 # define LEN		CHR+4
     96 # define SETRTNVAL	movl DEST(%esp), %eax
     97 #endif
     98 
        /* Build-mode configuration.

           PIC (SHARED or __PIC__ defined): ENTRANCE saves %ebx, because
           BRANCH_TO_JMPTBL_ENTRY overwrites it.  RETURN_END pops it and
           returns.  RETURN is RETURN_END followed by CFI_PUSH (%ebx); the
           function uses RETURN at every return except the textually last one,
           which uses RETURN_END.  Jump tables hold offsets relative to the
           table.

           Non-PIC: nothing is saved on entry and jump tables hold absolute
           addresses.  */
     99 #if (defined SHARED || defined __PIC__)
    100 # define ENTRANCE	PUSH (%ebx);
    101 # define RETURN_END	POP (%ebx); ret
    102 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    103 # define PARMS		8		/* 4 (return address) + 4 (EBX saved by ENTRANCE).  */
    104 # define JMPTBL(I, B)	I - B
    105 
    106 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    107    jump table with relative offsets, indexed by ECX (4 bytes per entry).
           Before the jump ECX is added to EDX.  EBX is overwritten.  */
    108 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    109     /* We first load PC into EBX.  */				\
    110     call	__i686.get_pc_thunk.bx;				\
    111     /* Get the address of the jump table.  */			\
    112     add		$(TABLE - .), %ebx;				\
    113     /* Get the entry and convert the relative offset to the	\
    114        absolute address.  */					\
    115     add		(%ebx,%ecx,4), %ebx;				\
    116     add		%ecx, %edx;					\
    117     /* We loaded the jump table and adjusted EDX. Go.  */	\
    118     jmp		*%ebx
    119 
        /* PC thunk: copies its own return address (the address of the
           instruction following the `call') from the top of the stack into
           EBX and returns.  The symbol is global but hidden.  */
    120 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
    121 	.globl	__i686.get_pc_thunk.bx
    122 	.hidden	__i686.get_pc_thunk.bx
    123 	ALIGN (4)
    124 	.type	__i686.get_pc_thunk.bx,@function
    125 __i686.get_pc_thunk.bx:
    126 	movl	(%esp), %ebx
    127 	ret
    128 #else
    129 # define ENTRANCE
    130 # define RETURN_END	ret
    131 # define RETURN		RETURN_END
    132 # define PARMS		4		/* Return address only.  */
    133 # define JMPTBL(I, B)	I
    134 
    135 /* Branch to an entry in a jump table.  TABLE is a jump table with
    136    absolute addresses, indexed by ECX (4 bytes per entry).  Before the
           jump ECX is added to EDX.  */
    137 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    138     add		%ecx, %edx;					\
    139     jmp		*TABLE(,%ecx,4)
    140 #endif
    141 
        /* MEMSET (default name: memset).

           Stores LEN copies of the low byte of CHR starting at DEST.  When
           built with USE_AS_BZERO the fill byte is zero and the arguments are
           just DEST and LEN.  Arguments are read from the stack at
           DEST/CHR/LEN(%esp).  Unless USE_AS_BZERO is defined, DEST is loaded
           into EAX before every return (SETRTNVAL).

           Register roles after the set-up below:
             ECX = number of bytes to store
             EAX = fill byte replicated into all four bytes
             EDX = destination pointer  */
    142 #ifndef MEMSET
    143 # define MEMSET memset
    144 #endif
    145 
    146 	.section .text.sse2,"ax",@progbits
    147 	ALIGN (4)
    148 ENTRY (MEMSET)
    149 	ENTRANCE
    150 
    151 	movl	LEN(%esp), %ecx
    152 #ifdef USE_AS_BZERO
    153 	xor	%eax, %eax
    154 #else
    155 	movzbl	CHR(%esp), %eax		/* EAX = 0x000000cc  */
    156 	movb	%al, %ah		/* EAX = 0x0000cccc  */
    157 	/* Fill the whole EAX with pattern.  EDX is only a scratch copy here.  */
    158 	movl	%eax, %edx
    159 	shl	$16, %eax
    160 	or	%edx, %eax		/* EAX = 0xcccccccc  */
    161 #endif
    162 	movl	DEST(%esp), %edx
    163 	cmp	$32, %ecx
    164 	jae	L(32bytesormore)	/* Unsigned: 32 bytes or more use SSE2 stores.  */
    165 
        /* Fewer than 32 bytes: dispatch on ECX.  The macro first advances EDX
           by ECX, so the handlers store at negative offsets from the end.  */
    166 L(write_less32bytes):
    167 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
    168 
    169 
    170 	.pushsection .rodata.sse2,"a",@progbits
        	/* Dispatch table for lengths 0..31, kept in .rodata.sse2.  Entry N
        	   refers to L(write_Nbytes); JMPTBL stores it relative to the table
        	   in PIC builds and as an absolute address otherwise.  Entries are
        	   4 bytes each and must stay in ascending order of N.  */
    171 	ALIGN (2)
    172 L(table_less_32bytes):
    173 	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
    174 	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
    175 	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
    176 	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
    177 	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
    178 	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
    179 	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
    180 	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
    181 	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
    182 	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
    183 	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
    184 	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
    185 	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
    186 	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
    187 	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
    188 	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
    189 	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
    190 	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
    191 	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
    192 	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
    193 	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
    194 	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
    195 	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
    196 	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
    197 	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
    198 	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
    199 	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
    200 	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
    201 	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
    202 	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
    203 	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
    204 	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
    205 	.popsection
    206 
    207 	ALIGN (4)
        /* Handlers for lengths 0..31, reached through table_less_32bytes.

           On entry EAX holds the fill pattern and EDX points one past the last
           byte to store (the dispatch macro added ECX to it).  There is one
           chain per value of length mod 4.  Entering at L(write_Nbytes) falls
           through a run of 4-byte stores at rising addresses, then writes the
           0-3 leftover bytes and returns.

           Chain for length % 4 == 0.  */
    208 L(write_28bytes):
    209 	movl	%eax, -28(%edx)
    210 L(write_24bytes):
    211 	movl	%eax, -24(%edx)
    212 L(write_20bytes):
    213 	movl	%eax, -20(%edx)
    214 L(write_16bytes):
    215 	movl	%eax, -16(%edx)
    216 L(write_12bytes):
    217 	movl	%eax, -12(%edx)
    218 L(write_8bytes):
    219 	movl	%eax, -8(%edx)
    220 L(write_4bytes):
    221 	movl	%eax, -4(%edx)
    222 L(write_0bytes):
    223 	SETRTNVAL
    224 	RETURN
    225 
    226 	ALIGN (4)
        /* Chain for length % 4 == 1: the last byte is stored with movb.  */
    227 L(write_29bytes):
    228 	movl	%eax, -29(%edx)
    229 L(write_25bytes):
    230 	movl	%eax, -25(%edx)
    231 L(write_21bytes):
    232 	movl	%eax, -21(%edx)
    233 L(write_17bytes):
    234 	movl	%eax, -17(%edx)
    235 L(write_13bytes):
    236 	movl	%eax, -13(%edx)
    237 L(write_9bytes):
    238 	movl	%eax, -9(%edx)
    239 L(write_5bytes):
    240 	movl	%eax, -5(%edx)
    241 L(write_1bytes):
    242 	movb	%al, -1(%edx)
    243 	SETRTNVAL
    244 	RETURN
    245 
    246 	ALIGN (4)
        /* Chain for length % 4 == 2: the last two bytes are stored with movw.  */
    247 L(write_30bytes):
    248 	movl	%eax, -30(%edx)
    249 L(write_26bytes):
    250 	movl	%eax, -26(%edx)
    251 L(write_22bytes):
    252 	movl	%eax, -22(%edx)
    253 L(write_18bytes):
    254 	movl	%eax, -18(%edx)
    255 L(write_14bytes):
    256 	movl	%eax, -14(%edx)
    257 L(write_10bytes):
    258 	movl	%eax, -10(%edx)
    259 L(write_6bytes):
    260 	movl	%eax, -6(%edx)
    261 L(write_2bytes):
    262 	movw	%ax, -2(%edx)
    263 	SETRTNVAL
    264 	RETURN
    265 
    266 	ALIGN (4)
        /* Chain for length % 4 == 3: the last three bytes are stored with movw
           followed by movb.  */
    267 L(write_31bytes):
    268 	movl	%eax, -31(%edx)
    269 L(write_27bytes):
    270 	movl	%eax, -27(%edx)
    271 L(write_23bytes):
    272 	movl	%eax, -23(%edx)
    273 L(write_19bytes):
    274 	movl	%eax, -19(%edx)
    275 L(write_15bytes):
    276 	movl	%eax, -15(%edx)
    277 L(write_11bytes):
    278 	movl	%eax, -11(%edx)
    279 L(write_7bytes):
    280 	movl	%eax, -7(%edx)
    281 L(write_3bytes):
    282 	movw	%ax, -3(%edx)
    283 	movb	%al, -1(%edx)
    284 	SETRTNVAL
    285 	RETURN
    286 
    287 	ALIGN (4)
    288 /* ECX >= 32; the alignment of EDX is not known yet.  */
    289 L(32bytesormore):
    290 	/* Fill xmm0 with the pattern.  */
    291 #ifdef USE_AS_BZERO
    292 	pxor	%xmm0, %xmm0
    293 #else
    294 	movd	%eax, %xmm0
    295 	pshufd	$0, %xmm0, %xmm0	/* Copy the low dword into all four dwords.  */
    296 #endif
    297 	testl	$0xf, %edx
    298 	jz	L(aligned_16)
    299 /* ECX >= 32 and EDX is not 16 byte aligned.  */
    300 L(not_aligned_16):
    301 	movdqu	%xmm0, (%edx)		/* One unaligned 16-byte store covers the head.  */
    302 	movl	%edx, %eax		/* EAX is used as scratch from here on.  */
    303 	and	$-16, %edx
    304 	add	$16, %edx		/* EDX = next 16-byte boundary above the old EDX.  */
    305 	sub	%edx, %eax		/* EAX = old EDX - new EDX = -(bytes skipped).  */
    306 	add	%eax, %ecx		/* ECX -= bytes skipped (1..15), so ECX may now be below 32.  */
    307 	movd	%xmm0, %eax		/* Put the fill pattern back into EAX.  */
    308 
    309 	ALIGN (4)
        /* EDX is 16 byte aligned and ECX is the number of bytes still to store.  */
    310 L(aligned_16):
    311 	cmp	$128, %ecx
    312 	jae	L(128bytesormore)
    313 
        /* Fewer than 128 bytes left: dispatch on ECX.  The macro advances EDX
           by ECX, so the handlers store at negative offsets from the end.  */
    314 L(aligned_16_less128bytes):
    315 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    316 
    317 	ALIGN (4)
        /* Bulk path.  On entry ECX >= 128, EDX is 16 byte aligned and XMM0 holds
           the fill pattern.  The store loop is chosen by length:

             ECX >= shared cache size  -> L(128bytesormore_nt_start)
             ECX >= data cache size    -> L(128bytes_L2_normal)
             otherwise                 -> L(128bytesormore_normal)

           Every loop stores 128 bytes per iteration and ends by dispatching
           the remaining 0..127 bytes through table_16_128bytes.

           Saving EBX: EBX is pushed here when SHARED_CACHE_SIZE is defined or
           the build is not PIC.  (In a PIC build without SHARED_CACHE_SIZE no
           push is needed, because ENTRANCE has already saved EBX.)  The two
           pops below and RESTORE_EBX_STATE are keyed on that same condition.
           They used to be keyed on DATA_CACHE_SIZE, which left the stack
           unbalanced in a PIC build defining only one of the two macros.  */
    318 L(128bytesormore):
    319 #ifdef SHARED_CACHE_SIZE
    320 	PUSH (%ebx)
    321 	mov	$SHARED_CACHE_SIZE, %ebx
    322 #else
    323 # if (defined SHARED || defined __PIC__)
    324 	call	__i686.get_pc_thunk.bx
    325 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    326 	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
    327 # else
    328 	PUSH (%ebx)
    329 	mov	__x86_shared_cache_size, %ebx
    330 # endif
    331 #endif
    332 	cmp	%ebx, %ecx
    333 	jae	L(128bytesormore_nt_start)	/* Taken with EBX = shared cache size.  */
    334 
    335 
        	/* Not the non-temporal path: undo the push made above, if any.
        	   RESTORE_EBX_STATE is the CFI-only note that re-creates the pushed
        	   state in front of L(128bytesormore_nt_start), which is entered
        	   only by the jump above.  */
        #if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
        	POP (%ebx)
        # define RESTORE_EBX_STATE CFI_PUSH (%ebx)
        #else
        # define RESTORE_EBX_STATE
        #endif
        	/* Compare the length with the data cache size.  Nothing between
        	   this compare and the `jae' below changes the flags.  */
    336 #ifdef DATA_CACHE_SIZE
    339 	cmp	$DATA_CACHE_SIZE, %ecx
    340 #else
    341 # if (defined SHARED || defined __PIC__)
    343 	call	__i686.get_pc_thunk.bx
    344 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    345 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
    346 # else
    349 	cmp	__x86_data_cache_size, %ecx
    350 # endif
    351 #endif
    352 
    353 	jae	L(128bytes_L2_normal)
        	/* Loop for lengths below the data cache size.  ECX is biased by -128:
        	   at the loop head ECX = bytes remaining - 128.  Each `sub' therefore
        	   sets CF exactly when fewer than 128 bytes will remain after the
        	   block being stored.  movdqa and lea leave the flags alone, so the
        	   jb/jae after the stores still test that `sub'.  The body is
        	   unrolled twice.  */
    354 	subl	$128, %ecx
    355 L(128bytesormore_normal):
    356 	sub	$128, %ecx
    357 	movdqa	%xmm0, (%edx)
    358 	movdqa	%xmm0, 0x10(%edx)
    359 	movdqa	%xmm0, 0x20(%edx)
    360 	movdqa	%xmm0, 0x30(%edx)
    361 	movdqa	%xmm0, 0x40(%edx)
    362 	movdqa	%xmm0, 0x50(%edx)
    363 	movdqa	%xmm0, 0x60(%edx)
    364 	movdqa	%xmm0, 0x70(%edx)
    365 	lea	128(%edx), %edx
    366 	jb	L(128bytesless_normal)
    367 
    368 
    369 	sub	$128, %ecx
    370 	movdqa	%xmm0, (%edx)
    371 	movdqa	%xmm0, 0x10(%edx)
    372 	movdqa	%xmm0, 0x20(%edx)
    373 	movdqa	%xmm0, 0x30(%edx)
    374 	movdqa	%xmm0, 0x40(%edx)
    375 	movdqa	%xmm0, 0x50(%edx)
    376 	movdqa	%xmm0, 0x60(%edx)
    377 	movdqa	%xmm0, 0x70(%edx)
    378 	lea	128(%edx), %edx
    379 	jae	L(128bytesormore_normal)
    380 
    381 L(128bytesless_normal):
    382 	add	$128, %ecx		/* Remove the bias: ECX = bytes remaining (0..127).  */
    383 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    384 
    385 	ALIGN (4)
        /* Loop for lengths of at least the data cache size but below the shared
           cache size.  ECX is the exact number of bytes remaining.  Each
           iteration prefetches 0x380 and 0x3c0 bytes ahead of EDX and stores
           128 bytes.  */
    386 L(128bytes_L2_normal):
    387 	prefetcht0	0x380(%edx)
    388 	prefetcht0	0x3c0(%edx)
    389 	sub	$128, %ecx
    390 	movdqa	%xmm0, (%edx)
    391 	movaps	%xmm0, 0x10(%edx)
    392 	movaps	%xmm0, 0x20(%edx)
    393 	movaps	%xmm0, 0x30(%edx)
    394 	movaps	%xmm0, 0x40(%edx)
    395 	movaps	%xmm0, 0x50(%edx)
    396 	movaps	%xmm0, 0x60(%edx)
    397 	movaps	%xmm0, 0x70(%edx)
    398 	add	$128, %edx
    399 	cmp	$128, %ecx
    400 	jae	L(128bytes_L2_normal)
    401 
    402 L(128bytesless_L2_normal):
    403 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    404 
        	/* CFI only, no instruction: the code below runs with EBX still
        	   pushed (when it was pushed at all).  */
    405 	RESTORE_EBX_STATE
        /* Length >= shared cache size; EBX = shared cache size.
           The first loop stores EBX rounded down to a multiple of 128 with
           ordinary stores.  ECX becomes the number of bytes left after that:
           ECX - EBX + (EBX & 0x7f).  EAX is scratch for this computation and is
           then reloaded with the fill pattern for the tail handlers.
           NOTE(review): the first loop stores one block before it tests EBX, so
           it relies on the cache size being at least 128 - confirm for the
           run-time __x86_shared_cache_size value.  */
    406 L(128bytesormore_nt_start):
    407 	sub	%ebx, %ecx
    408 	mov	%ebx, %eax
    409 	and	$0x7f, %eax
    410 	add	%eax, %ecx
    411 	movd	%xmm0, %eax
    412 	ALIGN (4)
    413 L(128bytesormore_shared_cache_loop):
    414 	prefetcht0	0x3c0(%edx)
    415 	prefetcht0	0x380(%edx)
    416 	sub	$0x80, %ebx
    417 	movdqa	%xmm0, (%edx)
    418 	movdqa	%xmm0, 0x10(%edx)
    419 	movdqa	%xmm0, 0x20(%edx)
    420 	movdqa	%xmm0, 0x30(%edx)
    421 	movdqa	%xmm0, 0x40(%edx)
    422 	movdqa	%xmm0, 0x50(%edx)
    423 	movdqa	%xmm0, 0x60(%edx)
    424 	movdqa	%xmm0, 0x70(%edx)
    425 	add	$0x80, %edx
    426 	cmp	$0x80, %ebx
    427 	jae	L(128bytesormore_shared_cache_loop)
    428 	cmp	$0x80, %ecx
    429 	jb	L(shared_cache_loop_end)	/* No full block left: skip the non-temporal loop.  */
    430 	ALIGN (4)
        /* Store the remaining full 128-byte blocks with non-temporal stores,
           then issue sfence.  */
    431 L(128bytesormore_nt):
    432 	sub	$0x80, %ecx
    433 	movntdq	%xmm0, (%edx)
    434 	movntdq	%xmm0, 0x10(%edx)
    435 	movntdq	%xmm0, 0x20(%edx)
    436 	movntdq	%xmm0, 0x30(%edx)
    437 	movntdq	%xmm0, 0x40(%edx)
    438 	movntdq	%xmm0, 0x50(%edx)
    439 	movntdq	%xmm0, 0x60(%edx)
    440 	movntdq	%xmm0, 0x70(%edx)
    441 	add	$0x80, %edx
    442 	cmp	$0x80, %ecx
    443 	jae	L(128bytesormore_nt)
    444 	sfence
    445 L(shared_cache_loop_end):
        	/* Undo the push made at L(128bytesormore); same condition as there.  */
    446 #if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
    447 	POP (%ebx)
    448 #endif
    449 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    450 
    451 
    452 	.pushsection .rodata.sse2,"a",@progbits
        	/* Dispatch table for the final 0..127 bytes, kept in .rodata.sse2.
        	   Entry N refers to L(aligned_16_Nbytes); JMPTBL stores it relative
        	   to the table in PIC builds and as an absolute address otherwise.
        	   Entries are 4 bytes each and must stay in ascending order of N.  */
    453 	ALIGN (2)
    454 L(table_16_128bytes):
    455 	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
    456 	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
    457 	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
    458 	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
    459 	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
    460 	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
    461 	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
    462 	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
    463 	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
    464 	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
    465 	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
    466 	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
    467 	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
    468 	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
    469 	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
    470 	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
    471 	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
    472 	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
    473 	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
    474 	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
    475 	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
    476 	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
    477 	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
    478 	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
    479 	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
    480 	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
    481 	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
    482 	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
    483 	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
    484 	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
    485 	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
    486 	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
    487 	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
    488 	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
    489 	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
    490 	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
    491 	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
    492 	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
    493 	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
    494 	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
    495 	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
    496 	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
    497 	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
    498 	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
    499 	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
    500 	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
    501 	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
    502 	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
    503 	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
    504 	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
    505 	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
    506 	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
    507 	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
    508 	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
    509 	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
    510 	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
    511 	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
    512 	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
    513 	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
    514 	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
    515 	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
    516 	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
    517 	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
    518 	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
    519 	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
    520 	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
    521 	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
    522 	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
    523 	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
    524 	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
    525 	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
    526 	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
    527 	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
    528 	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
    529 	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
    530 	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
    531 	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
    532 	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
    533 	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
    534 	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
    535 	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
    536 	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
    537 	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
    538 	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
    539 	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
    540 	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
    541 	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
    542 	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
    543 	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
    544 	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
    545 	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
    546 	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
    547 	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
    548 	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
    549 	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
    550 	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
    551 	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
    552 	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
    553 	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
    554 	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
    555 	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
    556 	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
    557 	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
    558 	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
    559 	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
    560 	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
    561 	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
    562 	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
    563 	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
    564 	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
    565 	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
    566 	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
    567 	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
    568 	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
    569 	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
    570 	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
    571 	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
    572 	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
    573 	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
    574 	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
    575 	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
    576 	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
    577 	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
    578 	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
    579 	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
    580 	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
    581 	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
    582 	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
    583 	.popsection
    584 
    585 	ALIGN (4)
        /* Tail handlers reached through table_16_128bytes, for a length N with
           N % 16 in 0..3.

           On entry XMM0 holds the 16-byte fill pattern, EAX its 4-byte form, and
           EDX points one past the last byte to store.  Every dispatch site adds
           N to a 16 byte aligned pointer, so EDX - N is 16 byte aligned and the
           movdqa stores in each chain hit aligned addresses.  Entering at
           L(aligned_16_Nbytes) falls through the 16-byte stores, then writes
           the N % 16 leftover bytes and returns.

           N % 16 == 0: nothing left over.  */
    586 L(aligned_16_112bytes):
    587 	movdqa	%xmm0, -112(%edx)
    588 L(aligned_16_96bytes):
    589 	movdqa	%xmm0, -96(%edx)
    590 L(aligned_16_80bytes):
    591 	movdqa	%xmm0, -80(%edx)
    592 L(aligned_16_64bytes):
    593 	movdqa	%xmm0, -64(%edx)
    594 L(aligned_16_48bytes):
    595 	movdqa	%xmm0, -48(%edx)
    596 L(aligned_16_32bytes):
    597 	movdqa	%xmm0, -32(%edx)
    598 L(aligned_16_16bytes):
    599 	movdqa	%xmm0, -16(%edx)
    600 L(aligned_16_0bytes):
    601 	SETRTNVAL
    602 	RETURN
    603 
    604 	ALIGN (4)
        /* N % 16 == 1: leftover stored as 1 byte.  */
    605 L(aligned_16_113bytes):
    606 	movdqa	%xmm0, -113(%edx)
    607 L(aligned_16_97bytes):
    608 	movdqa	%xmm0, -97(%edx)
    609 L(aligned_16_81bytes):
    610 	movdqa	%xmm0, -81(%edx)
    611 L(aligned_16_65bytes):
    612 	movdqa	%xmm0, -65(%edx)
    613 L(aligned_16_49bytes):
    614 	movdqa	%xmm0, -49(%edx)
    615 L(aligned_16_33bytes):
    616 	movdqa	%xmm0, -33(%edx)
    617 L(aligned_16_17bytes):
    618 	movdqa	%xmm0, -17(%edx)
    619 L(aligned_16_1bytes):
    620 	movb	%al, -1(%edx)
    621 	SETRTNVAL
    622 	RETURN
    623 
    624 	ALIGN (4)
        /* N % 16 == 2: leftover stored as 2 bytes.  */
    625 L(aligned_16_114bytes):
    626 	movdqa	%xmm0, -114(%edx)
    627 L(aligned_16_98bytes):
    628 	movdqa	%xmm0, -98(%edx)
    629 L(aligned_16_82bytes):
    630 	movdqa	%xmm0, -82(%edx)
    631 L(aligned_16_66bytes):
    632 	movdqa	%xmm0, -66(%edx)
    633 L(aligned_16_50bytes):
    634 	movdqa	%xmm0, -50(%edx)
    635 L(aligned_16_34bytes):
    636 	movdqa	%xmm0, -34(%edx)
    637 L(aligned_16_18bytes):
    638 	movdqa	%xmm0, -18(%edx)
    639 L(aligned_16_2bytes):
    640 	movw	%ax, -2(%edx)
    641 	SETRTNVAL
    642 	RETURN
    643 
    644 	ALIGN (4)
        /* N % 16 == 3: leftover stored as 2 + 1 bytes.  */
    645 L(aligned_16_115bytes):
    646 	movdqa	%xmm0, -115(%edx)
    647 L(aligned_16_99bytes):
    648 	movdqa	%xmm0, -99(%edx)
    649 L(aligned_16_83bytes):
    650 	movdqa	%xmm0, -83(%edx)
    651 L(aligned_16_67bytes):
    652 	movdqa	%xmm0, -67(%edx)
    653 L(aligned_16_51bytes):
    654 	movdqa	%xmm0, -51(%edx)
    655 L(aligned_16_35bytes):
    656 	movdqa	%xmm0, -35(%edx)
    657 L(aligned_16_19bytes):
    658 	movdqa	%xmm0, -19(%edx)
    659 L(aligned_16_3bytes):
    660 	movw	%ax, -3(%edx)
    661 	movb	%al, -1(%edx)
    662 	SETRTNVAL
    663 	RETURN
    664 
    665 	ALIGN (4)
        /* Tail handlers reached through table_16_128bytes, for a length N with
           N % 16 in 4..7.

           On entry XMM0 holds the 16-byte fill pattern, EAX its 4-byte form, and
           EDX points one past the last byte to store, with EDX - N 16 byte
           aligned.  Each chain falls through aligned 16-byte stores; the
           leftover bytes start with a 4-byte store from EAX.

           N % 16 == 4: leftover stored as 4 bytes.  */
    666 L(aligned_16_116bytes):
    667 	movdqa	%xmm0, -116(%edx)
    668 L(aligned_16_100bytes):
    669 	movdqa	%xmm0, -100(%edx)
    670 L(aligned_16_84bytes):
    671 	movdqa	%xmm0, -84(%edx)
    672 L(aligned_16_68bytes):
    673 	movdqa	%xmm0, -68(%edx)
    674 L(aligned_16_52bytes):
    675 	movdqa	%xmm0, -52(%edx)
    676 L(aligned_16_36bytes):
    677 	movdqa	%xmm0, -36(%edx)
    678 L(aligned_16_20bytes):
    679 	movdqa	%xmm0, -20(%edx)
    680 L(aligned_16_4bytes):
    681 	movl	%eax, -4(%edx)
    682 	SETRTNVAL
    683 	RETURN
    684 
    685 	ALIGN (4)
        /* N % 16 == 5: leftover stored as 4 + 1 bytes.  */
    686 L(aligned_16_117bytes):
    687 	movdqa	%xmm0, -117(%edx)
    688 L(aligned_16_101bytes):
    689 	movdqa	%xmm0, -101(%edx)
    690 L(aligned_16_85bytes):
    691 	movdqa	%xmm0, -85(%edx)
    692 L(aligned_16_69bytes):
    693 	movdqa	%xmm0, -69(%edx)
    694 L(aligned_16_53bytes):
    695 	movdqa	%xmm0, -53(%edx)
    696 L(aligned_16_37bytes):
    697 	movdqa	%xmm0, -37(%edx)
    698 L(aligned_16_21bytes):
    699 	movdqa	%xmm0, -21(%edx)
    700 L(aligned_16_5bytes):
    701 	movl	%eax, -5(%edx)
    702 	movb	%al, -1(%edx)
    703 	SETRTNVAL
    704 	RETURN
    705 
    706 	ALIGN (4)
        /* N % 16 == 6: leftover stored as 4 + 2 bytes.  */
    707 L(aligned_16_118bytes):
    708 	movdqa	%xmm0, -118(%edx)
    709 L(aligned_16_102bytes):
    710 	movdqa	%xmm0, -102(%edx)
    711 L(aligned_16_86bytes):
    712 	movdqa	%xmm0, -86(%edx)
    713 L(aligned_16_70bytes):
    714 	movdqa	%xmm0, -70(%edx)
    715 L(aligned_16_54bytes):
    716 	movdqa	%xmm0, -54(%edx)
    717 L(aligned_16_38bytes):
    718 	movdqa	%xmm0, -38(%edx)
    719 L(aligned_16_22bytes):
    720 	movdqa	%xmm0, -22(%edx)
    721 L(aligned_16_6bytes):
    722 	movl	%eax, -6(%edx)
    723 	movw	%ax, -2(%edx)
    724 	SETRTNVAL
    725 	RETURN
    726 
    727 	ALIGN (4)
        /* N % 16 == 7: leftover stored as 4 + 2 + 1 bytes.  */
    728 L(aligned_16_119bytes):
    729 	movdqa	%xmm0, -119(%edx)
    730 L(aligned_16_103bytes):
    731 	movdqa	%xmm0, -103(%edx)
    732 L(aligned_16_87bytes):
    733 	movdqa	%xmm0, -87(%edx)
    734 L(aligned_16_71bytes):
    735 	movdqa	%xmm0, -71(%edx)
    736 L(aligned_16_55bytes):
    737 	movdqa	%xmm0, -55(%edx)
    738 L(aligned_16_39bytes):
    739 	movdqa	%xmm0, -39(%edx)
    740 L(aligned_16_23bytes):
    741 	movdqa	%xmm0, -23(%edx)
    742 L(aligned_16_7bytes):
    743 	movl	%eax, -7(%edx)
    744 	movw	%ax, -3(%edx)
    745 	movb	%al, -1(%edx)
    746 	SETRTNVAL
    747 	RETURN
    748 
    749 	ALIGN (4)
        /* Tail handlers reached through table_16_128bytes, for a length N with
           N % 16 in 8..11.

           On entry XMM0 holds the 16-byte fill pattern, EAX its 4-byte form, and
           EDX points one past the last byte to store, with EDX - N 16 byte
           aligned.  Each chain falls through aligned 16-byte stores; the
           leftover bytes start with an 8-byte movq store of the low half of
           XMM0.

           N % 16 == 8: leftover stored as 8 bytes.  */
    750 L(aligned_16_120bytes):
    751 	movdqa	%xmm0, -120(%edx)
    752 L(aligned_16_104bytes):
    753 	movdqa	%xmm0, -104(%edx)
    754 L(aligned_16_88bytes):
    755 	movdqa	%xmm0, -88(%edx)
    756 L(aligned_16_72bytes):
    757 	movdqa	%xmm0, -72(%edx)
    758 L(aligned_16_56bytes):
    759 	movdqa	%xmm0, -56(%edx)
    760 L(aligned_16_40bytes):
    761 	movdqa	%xmm0, -40(%edx)
    762 L(aligned_16_24bytes):
    763 	movdqa	%xmm0, -24(%edx)
    764 L(aligned_16_8bytes):
    765 	movq	%xmm0, -8(%edx)
    766 	SETRTNVAL
    767 	RETURN
    768 
    769 	ALIGN (4)
        /* N % 16 == 9: leftover stored as 8 + 1 bytes.  */
    770 L(aligned_16_121bytes):
    771 	movdqa	%xmm0, -121(%edx)
    772 L(aligned_16_105bytes):
    773 	movdqa	%xmm0, -105(%edx)
    774 L(aligned_16_89bytes):
    775 	movdqa	%xmm0, -89(%edx)
    776 L(aligned_16_73bytes):
    777 	movdqa	%xmm0, -73(%edx)
    778 L(aligned_16_57bytes):
    779 	movdqa	%xmm0, -57(%edx)
    780 L(aligned_16_41bytes):
    781 	movdqa	%xmm0, -41(%edx)
    782 L(aligned_16_25bytes):
    783 	movdqa	%xmm0, -25(%edx)
    784 L(aligned_16_9bytes):
    785 	movq	%xmm0, -9(%edx)
    786 	movb	%al, -1(%edx)
    787 	SETRTNVAL
    788 	RETURN
    789 
    790 	ALIGN (4)
        /* N % 16 == 10: leftover stored as 8 + 2 bytes.  */
    791 L(aligned_16_122bytes):
    792 	movdqa	%xmm0, -122(%edx)
    793 L(aligned_16_106bytes):
    794 	movdqa	%xmm0, -106(%edx)
    795 L(aligned_16_90bytes):
    796 	movdqa	%xmm0, -90(%edx)
    797 L(aligned_16_74bytes):
    798 	movdqa	%xmm0, -74(%edx)
    799 L(aligned_16_58bytes):
    800 	movdqa	%xmm0, -58(%edx)
    801 L(aligned_16_42bytes):
    802 	movdqa	%xmm0, -42(%edx)
    803 L(aligned_16_26bytes):
    804 	movdqa	%xmm0, -26(%edx)
    805 L(aligned_16_10bytes):
    806 	movq	%xmm0, -10(%edx)
    807 	movw	%ax, -2(%edx)
    808 	SETRTNVAL
    809 	RETURN
    810 
    811 	ALIGN (4)
        /* N % 16 == 11: leftover stored as 8 + 2 + 1 bytes.  */
    812 L(aligned_16_123bytes):
    813 	movdqa	%xmm0, -123(%edx)
    814 L(aligned_16_107bytes):
    815 	movdqa	%xmm0, -107(%edx)
    816 L(aligned_16_91bytes):
    817 	movdqa	%xmm0, -91(%edx)
    818 L(aligned_16_75bytes):
    819 	movdqa	%xmm0, -75(%edx)
    820 L(aligned_16_59bytes):
    821 	movdqa	%xmm0, -59(%edx)
    822 L(aligned_16_43bytes):
    823 	movdqa	%xmm0, -43(%edx)
    824 L(aligned_16_27bytes):
    825 	movdqa	%xmm0, -27(%edx)
    826 L(aligned_16_11bytes):
    827 	movq	%xmm0, -11(%edx)
    828 	movw	%ax, -3(%edx)
    829 	movb	%al, -1(%edx)
    830 	SETRTNVAL
    831 	RETURN
    832 
    833 	ALIGN (4)
        /* Tail handlers reached through table_16_128bytes, for a length N with
           N % 16 in 12..15, followed by the end of the function.

           On entry XMM0 holds the 16-byte fill pattern, EAX its 4-byte form, and
           EDX points one past the last byte to store, with EDX - N 16 byte
           aligned.  Each chain falls through aligned 16-byte stores; the
           leftover bytes start with an 8-byte movq store followed by a 4-byte
           store from EAX.

           N % 16 == 12: leftover stored as 8 + 4 bytes.  */
    834 L(aligned_16_124bytes):
    835 	movdqa	%xmm0, -124(%edx)
    836 L(aligned_16_108bytes):
    837 	movdqa	%xmm0, -108(%edx)
    838 L(aligned_16_92bytes):
    839 	movdqa	%xmm0, -92(%edx)
    840 L(aligned_16_76bytes):
    841 	movdqa	%xmm0, -76(%edx)
    842 L(aligned_16_60bytes):
    843 	movdqa	%xmm0, -60(%edx)
    844 L(aligned_16_44bytes):
    845 	movdqa	%xmm0, -44(%edx)
    846 L(aligned_16_28bytes):
    847 	movdqa	%xmm0, -28(%edx)
    848 L(aligned_16_12bytes):
    849 	movq	%xmm0, -12(%edx)
    850 	movl	%eax, -4(%edx)
    851 	SETRTNVAL
    852 	RETURN
    853 
    854 	ALIGN (4)
        /* N % 16 == 13: leftover stored as 8 + 4 + 1 bytes.  */
    855 L(aligned_16_125bytes):
    856 	movdqa	%xmm0, -125(%edx)
    857 L(aligned_16_109bytes):
    858 	movdqa	%xmm0, -109(%edx)
    859 L(aligned_16_93bytes):
    860 	movdqa	%xmm0, -93(%edx)
    861 L(aligned_16_77bytes):
    862 	movdqa	%xmm0, -77(%edx)
    863 L(aligned_16_61bytes):
    864 	movdqa	%xmm0, -61(%edx)
    865 L(aligned_16_45bytes):
    866 	movdqa	%xmm0, -45(%edx)
    867 L(aligned_16_29bytes):
    868 	movdqa	%xmm0, -29(%edx)
    869 L(aligned_16_13bytes):
    870 	movq	%xmm0, -13(%edx)
    871 	movl	%eax, -5(%edx)
    872 	movb	%al, -1(%edx)
    873 	SETRTNVAL
    874 	RETURN
    875 
    876 	ALIGN (4)
        /* N % 16 == 14: leftover stored as 8 + 4 + 2 bytes.  */
    877 L(aligned_16_126bytes):
    878 	movdqa	%xmm0, -126(%edx)
    879 L(aligned_16_110bytes):
    880 	movdqa	%xmm0, -110(%edx)
    881 L(aligned_16_94bytes):
    882 	movdqa	%xmm0, -94(%edx)
    883 L(aligned_16_78bytes):
    884 	movdqa	%xmm0, -78(%edx)
    885 L(aligned_16_62bytes):
    886 	movdqa	%xmm0, -62(%edx)
    887 L(aligned_16_46bytes):
    888 	movdqa	%xmm0, -46(%edx)
    889 L(aligned_16_30bytes):
    890 	movdqa	%xmm0, -30(%edx)
    891 L(aligned_16_14bytes):
    892 	movq	%xmm0, -14(%edx)
    893 	movl	%eax, -6(%edx)
    894 	movw	%ax, -2(%edx)
    895 	SETRTNVAL
    896 	RETURN
    897 
    898 	ALIGN (4)
        /* N % 16 == 15: leftover stored as 8 + 4 + 2 + 1 bytes.  This is the
           textually last return, so it uses RETURN_END instead of RETURN.  */
    899 L(aligned_16_127bytes):
    900 	movdqa	%xmm0, -127(%edx)
    901 L(aligned_16_111bytes):
    902 	movdqa	%xmm0, -111(%edx)
    903 L(aligned_16_95bytes):
    904 	movdqa	%xmm0, -95(%edx)
    905 L(aligned_16_79bytes):
    906 	movdqa	%xmm0, -79(%edx)
    907 L(aligned_16_63bytes):
    908 	movdqa	%xmm0, -63(%edx)
    909 L(aligned_16_47bytes):
    910 	movdqa	%xmm0, -47(%edx)
    911 L(aligned_16_31bytes):
    912 	movdqa	%xmm0, -31(%edx)
    913 L(aligned_16_15bytes):
    914 	movq	%xmm0, -15(%edx)
    915 	movl	%eax, -7(%edx)
    916 	movw	%ax, -3(%edx)
    917 	movb	%al, -1(%edx)
    918 	SETRTNVAL
    919 	RETURN_END
    920 
    921 END (MEMSET)
    922