Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2014, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #include "cache.h"
     32 
     33 #ifndef MEMSET
     34 # define MEMSET memset
     35 #endif
     36 
     37 #ifndef L
     38 # define L(label)	.L##label
     39 #endif
     40 
     41 #ifndef ALIGN
     42 # define ALIGN(n)	.p2align n
     43 #endif
     44 
     45 #ifndef cfi_startproc
     46 # define cfi_startproc			.cfi_startproc
     47 #endif
     48 
     49 #ifndef cfi_endproc
     50 # define cfi_endproc			.cfi_endproc
     51 #endif
     52 
     53 #ifndef cfi_rel_offset
     54 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     55 #endif
     56 
     57 #ifndef cfi_restore
     58 # define cfi_restore(reg)		.cfi_restore reg
     59 #endif
     60 
     61 #ifndef cfi_adjust_cfa_offset
     62 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     63 #endif
     64 
     65 #ifndef ENTRY
     66 # define ENTRY(name)			\
     67 	.type name,  @function;		\
     68 	.globl name;			\
     69 	.p2align 4;			\
     70 name:					\
     71 	cfi_startproc
     72 #endif
     73 
     74 #ifndef END
     75 # define END(name)			\
     76 	cfi_endproc;			\
     77 	.size name, .-name
     78 #endif
     79 
     80 #define CFI_PUSH(REG)						\
     81   cfi_adjust_cfa_offset (4);					\
     82   cfi_rel_offset (REG, 0)
     83 
     84 #define CFI_POP(REG)						\
     85   cfi_adjust_cfa_offset (-4);					\
     86   cfi_restore (REG)
     87 
     88 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     89 #define POP(REG)	popl REG; CFI_POP (REG)
     90 
     91 #ifdef USE_AS_BZERO
     92 # define DEST		PARMS
     93 # define LEN		DEST+4
     94 # define SETRTNVAL
     95 #else
     96 # define DEST		PARMS
     97 # define CHR		DEST+4
     98 # define LEN		CHR+4
     99 # define SETRTNVAL	movl DEST(%esp), %eax
    100 #endif
    101 
    102 #if (defined SHARED || defined __PIC__)
    103 # define ENTRANCE	PUSH (%ebx);
    104 # define RETURN_END	POP (%ebx); ret
    105 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    106 # define PARMS		8		/* Preserve EBX.  */
    107 # define JMPTBL(I, B)	I - B
    108 
    109 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    110    jump table with relative offsets.   */
    111 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    112     /* We first load PC into EBX.  */				\
    113     call	__x86.get_pc_thunk.bx;				\
    114     /* Get the address of the jump table.  */			\
    115     add		$(TABLE - .), %ebx;				\
    116     /* Get the entry and convert the relative offset to the	\
    117        absolute address.  */					\
    118     add		(%ebx,%ecx,4), %ebx;				\
    119     add		%ecx, %edx;					\
    120     /* We loaded the jump table and adjuested EDX. Go.  */	\
    121     jmp		*%ebx
    122 
    123 	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
    124 	.globl	__x86.get_pc_thunk.bx
    125 	.hidden	__x86.get_pc_thunk.bx
    126 	ALIGN (4)
    127 	.type	__x86.get_pc_thunk.bx,@function
    128 __x86.get_pc_thunk.bx:
    129 	movl	(%esp), %ebx
    130 	ret
    131 #else
    132 # define ENTRANCE
    133 # define RETURN_END	ret
    134 # define RETURN		RETURN_END
    135 # define PARMS		4
    136 # define JMPTBL(I, B)	I
    137 
    138 /* Branch to an entry in a jump table.  TABLE is a jump table with
    139    absolute offsets.  */
    140 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    141     add		%ecx, %edx;					\
    142     jmp		*TABLE(,%ecx,4)
    143 #endif
    144 
    145 	.section .text.sse2,"ax",@progbits
    146 	ALIGN (4)
    147 ENTRY (MEMSET)
    148 	ENTRANCE
    149 
    150 	movl	LEN(%esp), %ecx
    151 	cmp	$0, %ecx
    152 	ja	L(1byteormore)
    153 	SETRTNVAL
    154 	RETURN
    155 
    156 L(1byteormore):
    157 #ifdef USE_AS_BZERO
    158 	xor	%eax, %eax
    159 #else
    160 	movzbl	CHR(%esp), %eax
    161 	movb	%al, %ah
    162 	/* Fill the whole EAX with pattern.  */
    163 	movl	%eax, %edx
    164 	shl	 $16, %eax
    165 	or	%edx, %eax
    166 #endif
    167 	movl	DEST(%esp), %edx
    168 	cmp	$1, %ecx
    169 	je	L(1byte)
    170 	cmp	$16, %ecx
    171 	jae	L(16bytesormore)
    172 
    173 	cmp	$4, %ecx
    174 	jb	L(4bytesless)
    175 	movl	%eax, (%edx)
    176 	movl	%eax, -4(%edx, %ecx)
    177 	cmp	$8, %ecx
    178 	jb	L(8bytesless)
    179 	movl	%eax, 4(%edx)
    180 	movl	%eax, -8(%edx, %ecx)
    181 L(8bytesless):
    182 	SETRTNVAL
    183 	RETURN
    184 
    185 L(4bytesless):
    186 	movw	%ax, (%edx)
    187 	movw	%ax, -2(%edx, %ecx)
    188 	SETRTNVAL
    189 	RETURN
    190 
    191 L(1byte):
    192 	movb	%al, (%edx)
    193 	SETRTNVAL
    194 	RETURN
    195 
    196 	ALIGN (4)
    197 L(16bytesormore):
    198 #ifdef USE_AS_BZERO
    199 	pxor	%xmm0, %xmm0
    200 #else
    201 	movd	%eax, %xmm0
    202 	pshufd	$0, %xmm0, %xmm0
    203 #endif
    204 
    205 	cmp	$64, %ecx
    206 	ja	L(64bytesmore)
    207 	movdqu	%xmm0, (%edx)
    208 	movdqu	%xmm0, -16(%edx, %ecx)
    209 	cmp	$32, %ecx
    210 	jbe	L(32bytesless)
    211 	movdqu	%xmm0, 16(%edx)
    212 	movdqu	%xmm0, -32(%edx, %ecx)
    213 L(32bytesless):
    214 	SETRTNVAL
    215 	RETURN
    216 
    217 L(64bytesmore):
    218 	testl	$0xf, %edx
    219 	jz	L(aligned_16)
    220 L(not_aligned_16):
    221 	movdqu	%xmm0, (%edx)
    222 	movl	%edx, %eax
    223 	and	$-16, %edx
    224 	add	$16, %edx
    225 	sub	%edx, %eax
    226 	add	%eax, %ecx
    227 	movd	%xmm0, %eax
    228 
    229 	ALIGN (4)
    230 L(aligned_16):
    231 	cmp	$128, %ecx
    232 	jae	L(128bytesormore)
    233 
    234 L(aligned_16_less128bytes):
    235 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    236 
    237 	ALIGN (4)
    238 L(128bytesormore):
    239 #ifdef SHARED_CACHE_SIZE
    240 	PUSH (%ebx)
    241 	mov	$SHARED_CACHE_SIZE, %ebx
    242 #else
    243 # if (defined SHARED || defined __PIC__)
    244 	call	__x86.get_pc_thunk.bx
    245 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    246 	mov	$__x86_shared_cache_size@GOTOFF(%ebx), %ebx
    247 # else
    248 	PUSH (%ebx)
    249 	mov	$__x86_shared_cache_size, %ebx
    250 # endif
    251 #endif
    252 	cmp	%ebx, %ecx
    253 	jae	L(128bytesormore_nt_start)
    254 
    255 	POP (%ebx)
    256 
    257 #ifdef DATA_CACHE_SIZE
    258 	PUSH (%ebx)
    259 	mov	$DATA_CACHE_SIZE, %ebx
    260 #else
    261 # if (defined SHARED || defined __PIC__)
    262 	call	__x86.get_pc_thunk.bx
    263 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    264 	mov	$__x86_data_cache_size@GOTOFF(%ebx), %ebx
    265 # else
    266 	PUSH (%ebx)
    267 	mov	$__x86_data_cache_size, %ebx
    268 # endif
    269 #endif
    270 
    271 	cmp	%ebx, %ecx
    272 	jae	L(128bytes_L2_normal)
    273 	subl	$128, %ecx
    274 L(128bytesormore_normal):
    275 	sub	$128, %ecx
    276 	movdqa	%xmm0, (%edx)
    277 	movaps	%xmm0, 0x10(%edx)
    278 	movaps	%xmm0, 0x20(%edx)
    279 	movaps	%xmm0, 0x30(%edx)
    280 	movaps	%xmm0, 0x40(%edx)
    281 	movaps	%xmm0, 0x50(%edx)
    282 	movaps	%xmm0, 0x60(%edx)
    283 	movaps	%xmm0, 0x70(%edx)
    284 	lea	128(%edx), %edx
    285 	jb	L(128bytesless_normal)
    286 
    287 
    288 	sub	$128, %ecx
    289 	movdqa	%xmm0, (%edx)
    290 	movaps	%xmm0, 0x10(%edx)
    291 	movaps	%xmm0, 0x20(%edx)
    292 	movaps	%xmm0, 0x30(%edx)
    293 	movaps	%xmm0, 0x40(%edx)
    294 	movaps	%xmm0, 0x50(%edx)
    295 	movaps	%xmm0, 0x60(%edx)
    296 	movaps	%xmm0, 0x70(%edx)
    297 	lea	128(%edx), %edx
    298 	jae	L(128bytesormore_normal)
    299 
    300 L(128bytesless_normal):
    301 	lea	128(%ecx), %ecx
    302 #if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
    303 	POP (%ebx)
    304 #endif
    305 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    306 
    307 	ALIGN (4)
    308 L(128bytes_L2_normal):
    309 	prefetchnta	0x380(%edx)
    310 	prefetchnta	0x3c0(%edx)
    311 	sub	$128, %ecx
    312 	movdqa	%xmm0, (%edx)
    313 	movaps	%xmm0, 0x10(%edx)
    314 	movaps	%xmm0, 0x20(%edx)
    315 	movaps	%xmm0, 0x30(%edx)
    316 	movaps	%xmm0, 0x40(%edx)
    317 	movaps	%xmm0, 0x50(%edx)
    318 	movaps	%xmm0, 0x60(%edx)
    319 	movaps	%xmm0, 0x70(%edx)
    320 	add	$128, %edx
    321 	cmp	$128, %ecx
    322 	jae	L(128bytes_L2_normal)
    323 
    324 L(128bytesless_L2_normal):
    325 #if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
    326 	POP (%ebx)
    327 #endif
    328 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    329 
    330 L(128bytesormore_nt_start):
    331 	sub	%ebx, %ecx
    332 	ALIGN (4)
    333 L(128bytesormore_shared_cache_loop):
    334 	prefetchnta	0x3c0(%edx)
    335 	prefetchnta	0x380(%edx)
    336 	sub	$0x80, %ebx
    337 	movdqa	%xmm0, (%edx)
    338 	movaps	%xmm0, 0x10(%edx)
    339 	movaps	%xmm0, 0x20(%edx)
    340 	movaps	%xmm0, 0x30(%edx)
    341 	movaps	%xmm0, 0x40(%edx)
    342 	movaps	%xmm0, 0x50(%edx)
    343 	movaps	%xmm0, 0x60(%edx)
    344 	movaps	%xmm0, 0x70(%edx)
    345 	add	$0x80, %edx
    346 	cmp	$0x80, %ebx
    347 	jae	L(128bytesormore_shared_cache_loop)
    348 	cmp	$0x80, %ecx
    349 	jb	L(shared_cache_loop_end)
    350 	ALIGN (4)
    351 L(128bytesormore_nt):
    352 	sub	$0x80, %ecx
    353 	movntdq	%xmm0, (%edx)
    354 	movntdq	%xmm0, 0x10(%edx)
    355 	movntdq	%xmm0, 0x20(%edx)
    356 	movntdq	%xmm0, 0x30(%edx)
    357 	movntdq	%xmm0, 0x40(%edx)
    358 	movntdq	%xmm0, 0x50(%edx)
    359 	movntdq	%xmm0, 0x60(%edx)
    360 	movntdq	%xmm0, 0x70(%edx)
    361 	add	$0x80, %edx
    362 	cmp	$0x80, %ecx
    363 	jae	L(128bytesormore_nt)
    364 	sfence
    365 L(shared_cache_loop_end):
    366 #if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
    367 	POP (%ebx)
    368 #endif
    369 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    370 
    371 
    372 	.pushsection .rodata.sse2,"a",@progbits
    373 	ALIGN (2)
    374 L(table_16_128bytes):
    375 	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
    376 	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
    377 	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
    378 	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
    379 	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
    380 	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
    381 	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
    382 	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
    383 	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
    384 	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
    385 	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
    386 	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
    387 	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
    388 	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
    389 	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
    390 	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
    391 	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
    392 	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
    393 	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
    394 	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
    395 	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
    396 	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
    397 	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
    398 	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
    399 	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
    400 	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
    401 	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
    402 	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
    403 	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
    404 	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
    405 	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
    406 	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
    407 	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
    408 	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
    409 	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
    410 	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
    411 	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
    412 	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
    413 	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
    414 	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
    415 	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
    416 	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
    417 	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
    418 	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
    419 	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
    420 	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
    421 	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
    422 	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
    423 	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
    424 	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
    425 	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
    426 	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
    427 	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
    428 	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
    429 	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
    430 	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
    431 	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
    432 	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
    433 	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
    434 	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
    435 	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
    436 	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
    437 	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
    438 	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
    439 	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
    440 	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
    441 	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
    442 	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
    443 	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
    444 	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
    445 	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
    446 	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
    447 	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
    448 	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
    449 	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
    450 	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
    451 	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
    452 	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
    453 	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
    454 	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
    455 	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
    456 	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
    457 	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
    458 	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
    459 	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
    460 	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
    461 	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
    462 	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
    463 	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
    464 	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
    465 	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
    466 	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
    467 	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
    468 	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
    469 	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
    470 	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
    471 	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
    472 	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
    473 	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
    474 	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
    475 	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
    476 	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
    477 	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
    478 	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
    479 	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
    480 	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
    481 	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
    482 	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
    483 	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
    484 	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
    485 	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
    486 	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
    487 	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
    488 	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
    489 	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
    490 	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
    491 	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
    492 	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
    493 	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
    494 	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
    495 	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
    496 	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
    497 	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
    498 	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
    499 	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
    500 	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
    501 	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
    502 	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
    503 	.popsection
    504 
    505 	ALIGN (4)
    506 L(aligned_16_112bytes):
    507 	movdqa	%xmm0, -112(%edx)
    508 L(aligned_16_96bytes):
    509 	movdqa	%xmm0, -96(%edx)
    510 L(aligned_16_80bytes):
    511 	movdqa	%xmm0, -80(%edx)
    512 L(aligned_16_64bytes):
    513 	movdqa	%xmm0, -64(%edx)
    514 L(aligned_16_48bytes):
    515 	movdqa	%xmm0, -48(%edx)
    516 L(aligned_16_32bytes):
    517 	movdqa	%xmm0, -32(%edx)
    518 L(aligned_16_16bytes):
    519 	movdqa	%xmm0, -16(%edx)
    520 L(aligned_16_0bytes):
    521 	SETRTNVAL
    522 	RETURN
    523 
    524 	ALIGN (4)
    525 L(aligned_16_113bytes):
    526 	movdqa	%xmm0, -113(%edx)
    527 L(aligned_16_97bytes):
    528 	movdqa	%xmm0, -97(%edx)
    529 L(aligned_16_81bytes):
    530 	movdqa	%xmm0, -81(%edx)
    531 L(aligned_16_65bytes):
    532 	movdqa	%xmm0, -65(%edx)
    533 L(aligned_16_49bytes):
    534 	movdqa	%xmm0, -49(%edx)
    535 L(aligned_16_33bytes):
    536 	movdqa	%xmm0, -33(%edx)
    537 L(aligned_16_17bytes):
    538 	movdqa	%xmm0, -17(%edx)
    539 L(aligned_16_1bytes):
    540 	movb	%al, -1(%edx)
    541 	SETRTNVAL
    542 	RETURN
    543 
    544 	ALIGN (4)
    545 L(aligned_16_114bytes):
    546 	movdqa	%xmm0, -114(%edx)
    547 L(aligned_16_98bytes):
    548 	movdqa	%xmm0, -98(%edx)
    549 L(aligned_16_82bytes):
    550 	movdqa	%xmm0, -82(%edx)
    551 L(aligned_16_66bytes):
    552 	movdqa	%xmm0, -66(%edx)
    553 L(aligned_16_50bytes):
    554 	movdqa	%xmm0, -50(%edx)
    555 L(aligned_16_34bytes):
    556 	movdqa	%xmm0, -34(%edx)
    557 L(aligned_16_18bytes):
    558 	movdqa	%xmm0, -18(%edx)
    559 L(aligned_16_2bytes):
    560 	movw	%ax, -2(%edx)
    561 	SETRTNVAL
    562 	RETURN
    563 
    564 	ALIGN (4)
    565 L(aligned_16_115bytes):
    566 	movdqa	%xmm0, -115(%edx)
    567 L(aligned_16_99bytes):
    568 	movdqa	%xmm0, -99(%edx)
    569 L(aligned_16_83bytes):
    570 	movdqa	%xmm0, -83(%edx)
    571 L(aligned_16_67bytes):
    572 	movdqa	%xmm0, -67(%edx)
    573 L(aligned_16_51bytes):
    574 	movdqa	%xmm0, -51(%edx)
    575 L(aligned_16_35bytes):
    576 	movdqa	%xmm0, -35(%edx)
    577 L(aligned_16_19bytes):
    578 	movdqa	%xmm0, -19(%edx)
    579 L(aligned_16_3bytes):
    580 	movw	%ax, -3(%edx)
    581 	movb	%al, -1(%edx)
    582 	SETRTNVAL
    583 	RETURN
    584 
    585 	ALIGN (4)
    586 L(aligned_16_116bytes):
    587 	movdqa	%xmm0, -116(%edx)
    588 L(aligned_16_100bytes):
    589 	movdqa	%xmm0, -100(%edx)
    590 L(aligned_16_84bytes):
    591 	movdqa	%xmm0, -84(%edx)
    592 L(aligned_16_68bytes):
    593 	movdqa	%xmm0, -68(%edx)
    594 L(aligned_16_52bytes):
    595 	movdqa	%xmm0, -52(%edx)
    596 L(aligned_16_36bytes):
    597 	movdqa	%xmm0, -36(%edx)
    598 L(aligned_16_20bytes):
    599 	movdqa	%xmm0, -20(%edx)
    600 L(aligned_16_4bytes):
    601 	movl	%eax, -4(%edx)
    602 	SETRTNVAL
    603 	RETURN
    604 
    605 	ALIGN (4)
    606 L(aligned_16_117bytes):
    607 	movdqa	%xmm0, -117(%edx)
    608 L(aligned_16_101bytes):
    609 	movdqa	%xmm0, -101(%edx)
    610 L(aligned_16_85bytes):
    611 	movdqa	%xmm0, -85(%edx)
    612 L(aligned_16_69bytes):
    613 	movdqa	%xmm0, -69(%edx)
    614 L(aligned_16_53bytes):
    615 	movdqa	%xmm0, -53(%edx)
    616 L(aligned_16_37bytes):
    617 	movdqa	%xmm0, -37(%edx)
    618 L(aligned_16_21bytes):
    619 	movdqa	%xmm0, -21(%edx)
    620 L(aligned_16_5bytes):
    621 	movl	%eax, -5(%edx)
    622 	movb	%al, -1(%edx)
    623 	SETRTNVAL
    624 	RETURN
    625 
    626 	ALIGN (4)
    627 L(aligned_16_118bytes):
    628 	movdqa	%xmm0, -118(%edx)
    629 L(aligned_16_102bytes):
    630 	movdqa	%xmm0, -102(%edx)
    631 L(aligned_16_86bytes):
    632 	movdqa	%xmm0, -86(%edx)
    633 L(aligned_16_70bytes):
    634 	movdqa	%xmm0, -70(%edx)
    635 L(aligned_16_54bytes):
    636 	movdqa	%xmm0, -54(%edx)
    637 L(aligned_16_38bytes):
    638 	movdqa	%xmm0, -38(%edx)
    639 L(aligned_16_22bytes):
    640 	movdqa	%xmm0, -22(%edx)
    641 L(aligned_16_6bytes):
    642 	movl	%eax, -6(%edx)
    643 	movw	%ax, -2(%edx)
    644 	SETRTNVAL
    645 	RETURN
    646 
    647 	ALIGN (4)
    648 L(aligned_16_119bytes):
    649 	movdqa	%xmm0, -119(%edx)
    650 L(aligned_16_103bytes):
    651 	movdqa	%xmm0, -103(%edx)
    652 L(aligned_16_87bytes):
    653 	movdqa	%xmm0, -87(%edx)
    654 L(aligned_16_71bytes):
    655 	movdqa	%xmm0, -71(%edx)
    656 L(aligned_16_55bytes):
    657 	movdqa	%xmm0, -55(%edx)
    658 L(aligned_16_39bytes):
    659 	movdqa	%xmm0, -39(%edx)
    660 L(aligned_16_23bytes):
    661 	movdqa	%xmm0, -23(%edx)
    662 L(aligned_16_7bytes):
    663 	movl	%eax, -7(%edx)
    664 	movw	%ax, -3(%edx)
    665 	movb	%al, -1(%edx)
    666 	SETRTNVAL
    667 	RETURN
    668 
    669 	ALIGN (4)
    670 L(aligned_16_120bytes):
    671 	movdqa	%xmm0, -120(%edx)
    672 L(aligned_16_104bytes):
    673 	movdqa	%xmm0, -104(%edx)
    674 L(aligned_16_88bytes):
    675 	movdqa	%xmm0, -88(%edx)
    676 L(aligned_16_72bytes):
    677 	movdqa	%xmm0, -72(%edx)
    678 L(aligned_16_56bytes):
    679 	movdqa	%xmm0, -56(%edx)
    680 L(aligned_16_40bytes):
    681 	movdqa	%xmm0, -40(%edx)
    682 L(aligned_16_24bytes):
    683 	movdqa	%xmm0, -24(%edx)
    684 L(aligned_16_8bytes):
    685 	movq	%xmm0, -8(%edx)
    686 	SETRTNVAL
    687 	RETURN
    688 
    689 	ALIGN (4)
    690 L(aligned_16_121bytes):
    691 	movdqa	%xmm0, -121(%edx)
    692 L(aligned_16_105bytes):
    693 	movdqa	%xmm0, -105(%edx)
    694 L(aligned_16_89bytes):
    695 	movdqa	%xmm0, -89(%edx)
    696 L(aligned_16_73bytes):
    697 	movdqa	%xmm0, -73(%edx)
    698 L(aligned_16_57bytes):
    699 	movdqa	%xmm0, -57(%edx)
    700 L(aligned_16_41bytes):
    701 	movdqa	%xmm0, -41(%edx)
    702 L(aligned_16_25bytes):
    703 	movdqa	%xmm0, -25(%edx)
    704 L(aligned_16_9bytes):
    705 	movq	%xmm0, -9(%edx)
    706 	movb	%al, -1(%edx)
    707 	SETRTNVAL
    708 	RETURN
    709 
    710 	ALIGN (4)
    711 L(aligned_16_122bytes):
    712 	movdqa	%xmm0, -122(%edx)
    713 L(aligned_16_106bytes):
    714 	movdqa	%xmm0, -106(%edx)
    715 L(aligned_16_90bytes):
    716 	movdqa	%xmm0, -90(%edx)
    717 L(aligned_16_74bytes):
    718 	movdqa	%xmm0, -74(%edx)
    719 L(aligned_16_58bytes):
    720 	movdqa	%xmm0, -58(%edx)
    721 L(aligned_16_42bytes):
    722 	movdqa	%xmm0, -42(%edx)
    723 L(aligned_16_26bytes):
    724 	movdqa	%xmm0, -26(%edx)
    725 L(aligned_16_10bytes):
    726 	movq	%xmm0, -10(%edx)
    727 	movw	%ax, -2(%edx)
    728 	SETRTNVAL
    729 	RETURN
    730 
    731 	ALIGN (4)
    732 L(aligned_16_123bytes):
    733 	movdqa	%xmm0, -123(%edx)
    734 L(aligned_16_107bytes):
    735 	movdqa	%xmm0, -107(%edx)
    736 L(aligned_16_91bytes):
    737 	movdqa	%xmm0, -91(%edx)
    738 L(aligned_16_75bytes):
    739 	movdqa	%xmm0, -75(%edx)
    740 L(aligned_16_59bytes):
    741 	movdqa	%xmm0, -59(%edx)
    742 L(aligned_16_43bytes):
    743 	movdqa	%xmm0, -43(%edx)
    744 L(aligned_16_27bytes):
    745 	movdqa	%xmm0, -27(%edx)
    746 L(aligned_16_11bytes):
    747 	movq	%xmm0, -11(%edx)
    748 	movw	%ax, -3(%edx)
    749 	movb	%al, -1(%edx)
    750 	SETRTNVAL
    751 	RETURN
    752 
    753 	ALIGN (4)
    754 L(aligned_16_124bytes):
    755 	movdqa	%xmm0, -124(%edx)
    756 L(aligned_16_108bytes):
    757 	movdqa	%xmm0, -108(%edx)
    758 L(aligned_16_92bytes):
    759 	movdqa	%xmm0, -92(%edx)
    760 L(aligned_16_76bytes):
    761 	movdqa	%xmm0, -76(%edx)
    762 L(aligned_16_60bytes):
    763 	movdqa	%xmm0, -60(%edx)
    764 L(aligned_16_44bytes):
    765 	movdqa	%xmm0, -44(%edx)
    766 L(aligned_16_28bytes):
    767 	movdqa	%xmm0, -28(%edx)
    768 L(aligned_16_12bytes):
    769 	movq	%xmm0, -12(%edx)
    770 	movl	%eax, -4(%edx)
    771 	SETRTNVAL
    772 	RETURN
    773 
    774 	ALIGN (4)
    775 L(aligned_16_125bytes):
    776 	movdqa	%xmm0, -125(%edx)
    777 L(aligned_16_109bytes):
    778 	movdqa	%xmm0, -109(%edx)
    779 L(aligned_16_93bytes):
    780 	movdqa	%xmm0, -93(%edx)
    781 L(aligned_16_77bytes):
    782 	movdqa	%xmm0, -77(%edx)
    783 L(aligned_16_61bytes):
    784 	movdqa	%xmm0, -61(%edx)
    785 L(aligned_16_45bytes):
    786 	movdqa	%xmm0, -45(%edx)
    787 L(aligned_16_29bytes):
    788 	movdqa	%xmm0, -29(%edx)
    789 L(aligned_16_13bytes):
    790 	movq	%xmm0, -13(%edx)
    791 	movl	%eax, -5(%edx)
    792 	movb	%al, -1(%edx)
    793 	SETRTNVAL
    794 	RETURN
    795 
    796 	ALIGN (4)
    797 L(aligned_16_126bytes):
    798 	movdqa	%xmm0, -126(%edx)
    799 L(aligned_16_110bytes):
    800 	movdqa	%xmm0, -110(%edx)
    801 L(aligned_16_94bytes):
    802 	movdqa	%xmm0, -94(%edx)
    803 L(aligned_16_78bytes):
    804 	movdqa	%xmm0, -78(%edx)
    805 L(aligned_16_62bytes):
    806 	movdqa	%xmm0, -62(%edx)
    807 L(aligned_16_46bytes):
    808 	movdqa	%xmm0, -46(%edx)
    809 L(aligned_16_30bytes):
    810 	movdqa	%xmm0, -30(%edx)
    811 L(aligned_16_14bytes):
    812 	movq	%xmm0, -14(%edx)
    813 	movl	%eax, -6(%edx)
    814 	movw	%ax, -2(%edx)
    815 	SETRTNVAL
    816 	RETURN
    817 
    818 	ALIGN (4)
    819 L(aligned_16_127bytes):
    820 	movdqa	%xmm0, -127(%edx)
    821 L(aligned_16_111bytes):
    822 	movdqa	%xmm0, -111(%edx)
    823 L(aligned_16_95bytes):
    824 	movdqa	%xmm0, -95(%edx)
    825 L(aligned_16_79bytes):
    826 	movdqa	%xmm0, -79(%edx)
    827 L(aligned_16_63bytes):
    828 	movdqa	%xmm0, -63(%edx)
    829 L(aligned_16_47bytes):
    830 	movdqa	%xmm0, -47(%edx)
    831 L(aligned_16_31bytes):
    832 	movdqa	%xmm0, -31(%edx)
    833 L(aligned_16_15bytes):
    834 	movq	%xmm0, -15(%edx)
    835 	movl	%eax, -7(%edx)
    836 	movw	%ax, -3(%edx)
    837 	movb	%al, -1(%edx)
    838 	SETRTNVAL
    839 	RETURN_END
    840 
    841 END (MEMSET)
    842