Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2010, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 #include "cache.h"
     32 
     33 #ifndef L
        /* L(label): spell LABEL with the ".L" prefix, which GAS keeps as a
           non-exported local symbol.  */
     34 # define L(label)	.L##label
     35 #endif
     36 
     37 #ifndef ALIGN
        /* ALIGN(n): emit ".p2align n", i.e. align to a 2^n-byte boundary.  */
     38 # define ALIGN(n)	.p2align n
     39 #endif
     40 
     41 #ifndef cfi_startproc
        /* Thin wrappers around the GAS .cfi_* unwind directives.  Each one is
           defined only when no earlier definition of the same name exists.  */
     42 # define cfi_startproc			.cfi_startproc
     43 #endif
     44 
     45 #ifndef cfi_endproc
     46 # define cfi_endproc			.cfi_endproc
     47 #endif
     48 
     49 #ifndef cfi_rel_offset
        /* Record that REG is saved at OFF relative to the current stack pointer.  */
     50 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     51 #endif
     52 
     53 #ifndef cfi_restore
        /* Record that REG again holds the value it had at function entry.  */
     54 # define cfi_restore(reg)		.cfi_restore reg
     55 #endif
     56 
     57 #ifndef cfi_adjust_cfa_offset
        /* Record that the stack pointer moved by OFF bytes relative to the CFA.  */
     58 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     59 #endif
     60 
     61 #ifndef ENTRY
        /* ENTRY(name): declare NAME as a global function symbol, align it with
           ".p2align 4", place the label and open the CFI procedure.  */
     62 # define ENTRY(name)			\
     63 	.type name,  @function; 	\
     64 	.globl name;			\
     65 	.p2align 4;			\
     66 name:					\
     67 	cfi_startproc
     68 #endif
     69 
     70 #ifndef END
        /* END(name): close the CFI procedure and set NAME's symbol size to the
           distance from its label to this point.  */
     71 # define END(name)			\
     72 	cfi_endproc;			\
     73 	.size name, .-name
     74 #endif
     75 
     76 #define CFI_PUSH(REG)						\
     77   cfi_adjust_cfa_offset (4);					\
     78   cfi_rel_offset (REG, 0)
        /* CFI_PUSH above: unwind bookkeeping for a 4-byte push of REG (CFA offset
           grows by 4, REG saved at offset 0 from the stack pointer).
           CFI_POP below: the inverse bookkeeping for a 4-byte pop of REG.  */
     79 
     80 #define CFI_POP(REG)						\
     81   cfi_adjust_cfa_offset (-4);					\
     82   cfi_restore (REG)
     83 
        /* PUSH/POP: the real pushl/popl followed by the matching CFI note.  */
     84 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     85 #define POP(REG)	popl REG; CFI_POP (REG)
     86 
     87 #ifdef USE_AS_BZERO
        /* Stack displacements of the arguments, used as DEST(%esp) etc.
           bzero form: (dest, len); SETRTNVAL expands to nothing, so EAX is not
           reloaded before returning.  */
     88 # define DEST		PARMS
     89 # define LEN		DEST+4
     90 # define SETRTNVAL
     91 #else
        /* memset form: (dest, chr, len); SETRTNVAL reloads DEST into EAX as the
           return value.  */
     92 # define DEST		PARMS
     93 # define CHR		DEST+4
     94 # define LEN		CHR+4
     95 # define SETRTNVAL	movl DEST(%esp), %eax
     96 #endif
     97 
     98 #if (defined SHARED || defined __PIC__)
        /* Position-independent variant.  EBX is needed to compute addresses, so
           ENTRANCE saves it and RETURN_END restores it before "ret".
           RETURN additionally re-declares the EBX push to the unwinder (CFI only,
           no instruction) so that code placed after the "ret" is described with
           EBX still on the stack.
           PARMS is 8: the saved EBX (4 bytes, see PUSH) sits below the first
           argument in addition to the 4 bytes present in the non-PIC variant.
           JMPTBL stores each target as an offset from the table base B.  */
     99 # define ENTRANCE	PUSH (%ebx);
    100 # define RETURN_END	POP (%ebx); ret
    101 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
    102 # define PARMS		8		/* Preserve EBX.  */
    103 # define JMPTBL(I, B)	I - B
    104 
    105 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    106    jump table with relative offsets.  ECX is the table index; it is also
           added to EDX before the jump.  EBX is clobbered.  */
    107 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    108     /* We first load PC into EBX.  */				\
    109     call	__x86.get_pc_thunk.bx;				\
    110     /* Get the address of the jump table.  */			\
    111     add		$(TABLE - .), %ebx;				\
    112     /* Get the entry and convert the relative offset to the	\
    113        absolute address.  */					\
    114     add		(%ebx,%ecx,4), %ebx;				\
    115     add		%ecx, %edx;					\
    116     /* We loaded the jump table and adjusted EDX. Go.  */	\
    117     jmp		*%ebx
    118 
        /* PC thunk: returns with EBX = the caller's return address, read from
           the top of the stack.  */
    119 	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
    120 	.globl	__x86.get_pc_thunk.bx
    121 	.hidden	__x86.get_pc_thunk.bx
    122 	ALIGN (4)
    123 	.type	__x86.get_pc_thunk.bx,@function
    124 __x86.get_pc_thunk.bx:
    125 	movl	(%esp), %ebx
    126 	ret
    127 #else
        /* Non-PIC variant: nothing is pushed on entry, so the first argument is
           at 4(%esp) and RETURN is a plain "ret".  JMPTBL stores the target
           address itself.  */
    128 # define ENTRANCE
    129 # define RETURN_END	ret
    130 # define RETURN		RETURN_END
    131 # define PARMS		4
    132 # define JMPTBL(I, B)	I
    133 
    134 /* Branch to an entry in a jump table.  TABLE is a jump table with
    135    absolute offsets.  ECX is the table index; it is also added to EDX
           before the jump.  */
    136 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    137     add		%ecx, %edx;					\
    138     jmp		*TABLE(,%ecx,4)
    139 #endif
    140 
    141 #ifndef MEMSET
        /* Name of the exported routine; "memset" unless already defined.  */
    142 # define MEMSET memset
    143 #endif
    144 
    145 	.section .text.sse2,"ax",@progbits
    146 	ALIGN (4)
        /* MEMSET: store LEN copies of the low byte of CHR starting at DEST
           (all-zero bytes when USE_AS_BZERO is defined).  The arguments are read
           from the stack through the DEST/CHR/LEN displacements.
           Register roles set up here:
             ECX = byte count
             EAX = fill byte replicated into all four bytes (0 for bzero)
             EDX = destination pointer
           Counts below 32 are dispatched through L(table_less_32bytes); larger
           counts continue at L(32bytesormore).  */
    147 ENTRY (MEMSET)
    148 	ENTRANCE
    149 
    150 	movl	LEN(%esp), %ecx
    151 #ifdef USE_AS_BZERO
    152 	xor	%eax, %eax
    153 #else
    154 	movzbl	CHR(%esp), %eax		/* EAX = 0x000000cc  */
    155 	movb	%al, %ah		/* EAX = 0x0000cccc  */
    156 	/* Fill the whole EAX with pattern.  */
    157 	movl	%eax, %edx
    158 	shl	$16, %eax
    159 	or	%edx, %eax		/* EAX = 0xcccccccc  */
    160 #endif
    161 	movl	DEST(%esp), %edx
    162 	cmp	$32, %ecx
    163 	jae	L(32bytesormore)	/* unsigned: count >= 32  */
    164 
        /* Here ECX is 0..31 and indexes the table directly.  */
    165 L(write_less32bytes):
    166 	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
    167 
    168 
    169 	.pushsection .rodata.sse2,"a",@progbits
        /* Dispatch table for counts 0..31: entry n refers to L(write_<n>bytes),
           encoded by JMPTBL.  Entries are 4-byte .int values, aligned to 4 bytes
           by ALIGN (2).  The table is emitted into .rodata.sse2 and the previous
           section is resumed by .popsection.  */
    170 	ALIGN (2)
    171 L(table_less_32bytes):
    172 	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
    173 	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
    174 	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
    175 	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
    176 	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
    177 	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
    178 	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
    179 	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
    180 	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
    181 	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
    182 	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
    183 	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
    184 	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
    185 	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
    186 	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
    187 	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
    188 	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
    189 	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
    190 	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
    191 	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
    192 	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
    193 	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
    194 	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
    195 	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
    196 	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
    197 	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
    198 	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
    199 	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
    200 	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
    201 	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
    202 	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
    203 	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
    204 	.popsection
    205 
    206 	ALIGN (4)
        /* Tails for counts 0..31.  BRANCH_TO_JMPTBL_ENTRY has already added the
           count to EDX, so EDX points one past the last byte and every store uses
           a negative displacement.  There is one fall-through chain per value of
           (count mod 4): each label stores 4 bytes from EAX and drops into the
           label for count-4; the last 0..3 bytes use movw/movb.  */
    207 L(write_28bytes):
    208 	movl	%eax, -28(%edx)
    209 L(write_24bytes):
    210 	movl	%eax, -24(%edx)
    211 L(write_20bytes):
    212 	movl	%eax, -20(%edx)
    213 L(write_16bytes):
    214 	movl	%eax, -16(%edx)
    215 L(write_12bytes):
    216 	movl	%eax, -12(%edx)
    217 L(write_8bytes):
    218 	movl	%eax, -8(%edx)
    219 L(write_4bytes):
    220 	movl	%eax, -4(%edx)
    221 L(write_0bytes):
    222 	SETRTNVAL
    223 	RETURN
    224 
    225 	ALIGN (4)
        /* count mod 4 == 1: dwords, then one trailing byte.  */
    226 L(write_29bytes):
    227 	movl	%eax, -29(%edx)
    228 L(write_25bytes):
    229 	movl	%eax, -25(%edx)
    230 L(write_21bytes):
    231 	movl	%eax, -21(%edx)
    232 L(write_17bytes):
    233 	movl	%eax, -17(%edx)
    234 L(write_13bytes):
    235 	movl	%eax, -13(%edx)
    236 L(write_9bytes):
    237 	movl	%eax, -9(%edx)
    238 L(write_5bytes):
    239 	movl	%eax, -5(%edx)
    240 L(write_1bytes):
    241 	movb	%al, -1(%edx)
    242 	SETRTNVAL
    243 	RETURN
    244 
    245 	ALIGN (4)
        /* count mod 4 == 2: dwords, then one trailing word.  */
    246 L(write_30bytes):
    247 	movl	%eax, -30(%edx)
    248 L(write_26bytes):
    249 	movl	%eax, -26(%edx)
    250 L(write_22bytes):
    251 	movl	%eax, -22(%edx)
    252 L(write_18bytes):
    253 	movl	%eax, -18(%edx)
    254 L(write_14bytes):
    255 	movl	%eax, -14(%edx)
    256 L(write_10bytes):
    257 	movl	%eax, -10(%edx)
    258 L(write_6bytes):
    259 	movl	%eax, -6(%edx)
    260 L(write_2bytes):
    261 	movw	%ax, -2(%edx)
    262 	SETRTNVAL
    263 	RETURN
    264 
    265 	ALIGN (4)
        /* count mod 4 == 3: dwords, then a word and a byte.  */
    266 L(write_31bytes):
    267 	movl	%eax, -31(%edx)
    268 L(write_27bytes):
    269 	movl	%eax, -27(%edx)
    270 L(write_23bytes):
    271 	movl	%eax, -23(%edx)
    272 L(write_19bytes):
    273 	movl	%eax, -19(%edx)
    274 L(write_15bytes):
    275 	movl	%eax, -15(%edx)
    276 L(write_11bytes):
    277 	movl	%eax, -11(%edx)
    278 L(write_7bytes):
    279 	movl	%eax, -7(%edx)
    280 L(write_3bytes):
    281 	movw	%ax, -3(%edx)
    282 	movb	%al, -1(%edx)
    283 	SETRTNVAL
    284 	RETURN
    285 
    286 	ALIGN (4)
    287 /* ECX >= 32 (unsigned).  EDX is DEST as passed in; its alignment has not
           been checked yet.  */
    288 L(32bytesormore):
    289 	/* Fill xmm0 with the pattern.  */
    290 #ifdef USE_AS_BZERO
    291 	pxor	%xmm0, %xmm0
    292 #else
    293 	movd	%eax, %xmm0
    294 	pshufd	$0, %xmm0, %xmm0	/* copy dword 0 into all four dwords  */
    295 #endif
    296 	testl	$0xf, %edx
    297 	jz	L(aligned_16)
    298 /* ECX >= 32 and EDX is not 16 byte aligned.  */
    299 L(not_aligned_16):
        	/* Cover the unaligned head with one unaligned 16-byte store, then
        	   round EDX up to the next 16-byte boundary.  EAX is borrowed to
        	   compute old EDX - new EDX (-1..-15), which is added to ECX so the
        	   count shrinks by the bytes skipped.  The final movd puts the fill
        	   pattern back into EAX for the tail code.  */
    300 	movdqu	%xmm0, (%edx)
    301 	movl	%edx, %eax
    302 	and	$-16, %edx
    303 	add	$16, %edx
    304 	sub	%edx, %eax
    305 	add	%eax, %ecx
    306 	movd	%xmm0, %eax
    307 
    308 	ALIGN (4)
        /* EDX is 16-byte aligned from here on.  */
    309 L(aligned_16):
    310 	cmp	$128, %ecx
    311 	jae	L(128bytesormore)
    312 
        /* ECX is below 128 here and indexes the table directly.  */
    313 L(aligned_16_less128bytes):
    314 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    315 
    316 	ALIGN (4)
        /* ECX >= 128, EDX 16-byte aligned.  Load the shared cache size into EBX
           (compile-time SHARED_CACHE_SIZE, or the __x86_shared_cache_size
           variable) and take the non-temporal path when ECX >= EBX.
           In the PIC case without SHARED_CACHE_SIZE no extra push is done here:
           EBX is simply overwritten by the thunk call.  */
    317 L(128bytesormore):
    318 #ifdef SHARED_CACHE_SIZE
    319 	PUSH (%ebx)
    320 	mov	$SHARED_CACHE_SIZE, %ebx
    321 #else
    322 # if (defined SHARED || defined __PIC__)
    323 	call	__x86.get_pc_thunk.bx
    324 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    325 	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
    326 # else
    327 	PUSH (%ebx)
    328 	mov	__x86_shared_cache_size, %ebx
    329 # endif
    330 #endif
    331 	cmp	%ebx, %ecx
    332 	jae	L(128bytesormore_nt_start)
    333 
    334 
        /* Compare ECX with the data cache size.  RESTORE_EBX_STATE is defined
           here for later use: it is CFI_PUSH (%ebx) in the branches that pop EBX
           and empty in the branch that does not.
           NOTE(review): the PUSH above is selected by SHARED_CACHE_SIZE while the
           POP below is selected by DATA_CACHE_SIZE.  In PIC builds the stack
           only balances if both macros are defined or both are undefined --
           confirm against cache.h.  */
    335 #ifdef DATA_CACHE_SIZE
    336 	POP (%ebx)
    337 # define RESTORE_EBX_STATE CFI_PUSH (%ebx)
    338 	cmp	$DATA_CACHE_SIZE, %ecx
    339 #else
    340 # if (defined SHARED || defined __PIC__)
    341 #  define RESTORE_EBX_STATE
    342 	call	__x86.get_pc_thunk.bx
    343 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    344 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
    345 # else
    346 	POP (%ebx)
    347 #  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
    348 	cmp	__x86_data_cache_size, %ecx
    349 # endif
    350 #endif
    351 
    352 	jae	L(128bytes_L2_normal)
        	/* Pre-bias ECX by -128 so that each "sub $128" in the loop borrows
        	   exactly when fewer than 128 bytes remain after the block being
        	   stored.  */
    353 	subl	$128, %ecx
        /* Loop unrolled twice, 128 bytes per half.  The branch after each half
           tests the flags of the preceding "sub"; the movdqa stores and the lea
           in between leave the flags untouched, which is why EDX is advanced
           with lea.  */
    354 L(128bytesormore_normal):
    355 	sub	$128, %ecx
    356 	movdqa	%xmm0, (%edx)
    357 	movdqa	%xmm0, 0x10(%edx)
    358 	movdqa	%xmm0, 0x20(%edx)
    359 	movdqa	%xmm0, 0x30(%edx)
    360 	movdqa	%xmm0, 0x40(%edx)
    361 	movdqa	%xmm0, 0x50(%edx)
    362 	movdqa	%xmm0, 0x60(%edx)
    363 	movdqa	%xmm0, 0x70(%edx)
    364 	lea	128(%edx), %edx
    365 	jb	L(128bytesless_normal)
    366 
    367 
    368 	sub	$128, %ecx
    369 	movdqa	%xmm0, (%edx)
    370 	movdqa	%xmm0, 0x10(%edx)
    371 	movdqa	%xmm0, 0x20(%edx)
    372 	movdqa	%xmm0, 0x30(%edx)
    373 	movdqa	%xmm0, 0x40(%edx)
    374 	movdqa	%xmm0, 0x50(%edx)
    375 	movdqa	%xmm0, 0x60(%edx)
    376 	movdqa	%xmm0, 0x70(%edx)
    377 	lea	128(%edx), %edx
    378 	jae	L(128bytesormore_normal)
    379 
        /* Undo the bias: ECX becomes the remaining count (0..127).  */
    380 L(128bytesless_normal):
    381 	add	$128, %ecx
    382 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    383 
    384 	ALIGN (4)
        /* Loop for counts at or above the data cache size (and below the shared
           cache size).  Each pass prefetches the lines at EDX+0x380 and
           EDX+0x3c0, stores 128 bytes with aligned stores, and repeats while at
           least 128 bytes remain.  */
    385 L(128bytes_L2_normal):
    386 	prefetcht0	0x380(%edx)
    387 	prefetcht0	0x3c0(%edx)
    388 	sub	$128, %ecx
    389 	movdqa	%xmm0, (%edx)
    390 	movaps	%xmm0, 0x10(%edx)
    391 	movaps	%xmm0, 0x20(%edx)
    392 	movaps	%xmm0, 0x30(%edx)
    393 	movaps	%xmm0, 0x40(%edx)
    394 	movaps	%xmm0, 0x50(%edx)
    395 	movaps	%xmm0, 0x60(%edx)
    396 	movaps	%xmm0, 0x70(%edx)
    397 	add	$128, %edx
    398 	cmp	$128, %ecx
    399 	jae	L(128bytes_L2_normal)
    400 
        /* ECX is now 0..127: finish through the tail table.  */
    401 L(128bytesless_L2_normal):
    402 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    403 
    404 	RESTORE_EBX_STATE
        /* RESTORE_EBX_STATE emits unwind information only (or nothing): the
           code below is reached by a jump taken before EBX was popped, so the
           pushed-EBX state is re-declared for it.
           On entry EBX = shared cache size and ECX >= EBX.  The first loop
           writes EBX rounded down to a multiple of 128 with ordinary stores;
           ECX is set to what remains after that.  EAX is used as scratch and
           then reloaded with the fill pattern from xmm0.
           NOTE(review): the first loop stores 128 bytes before it tests EBX, so
           it assumes EBX >= 128 on entry -- confirm the cache size can never
           be smaller.  */
    405 L(128bytesormore_nt_start):
    406 	sub	%ebx, %ecx
    407 	mov	%ebx, %eax
    408 	and	$0x7f, %eax
    409 	add	%eax, %ecx
    410 	movd	%xmm0, %eax
    411 	ALIGN (4)
    412 L(128bytesormore_shared_cache_loop):
    413 	prefetcht0	0x3c0(%edx)
    414 	prefetcht0	0x380(%edx)
    415 	sub	$0x80, %ebx
    416 	movdqa	%xmm0, (%edx)
    417 	movdqa	%xmm0, 0x10(%edx)
    418 	movdqa	%xmm0, 0x20(%edx)
    419 	movdqa	%xmm0, 0x30(%edx)
    420 	movdqa	%xmm0, 0x40(%edx)
    421 	movdqa	%xmm0, 0x50(%edx)
    422 	movdqa	%xmm0, 0x60(%edx)
    423 	movdqa	%xmm0, 0x70(%edx)
    424 	add	$0x80, %edx
    425 	cmp	$0x80, %ebx
    426 	jae	L(128bytesormore_shared_cache_loop)
    427 	cmp	$0x80, %ecx
    428 	jb	L(shared_cache_loop_end)
    429 	ALIGN (4)
        /* Remaining full 128-byte blocks are written with non-temporal stores;
           the sfence after the loop orders them before the code that follows.  */
    430 L(128bytesormore_nt):
    431 	sub	$0x80, %ecx
    432 	movntdq	%xmm0, (%edx)
    433 	movntdq	%xmm0, 0x10(%edx)
    434 	movntdq	%xmm0, 0x20(%edx)
    435 	movntdq	%xmm0, 0x30(%edx)
    436 	movntdq	%xmm0, 0x40(%edx)
    437 	movntdq	%xmm0, 0x50(%edx)
    438 	movntdq	%xmm0, 0x60(%edx)
    439 	movntdq	%xmm0, 0x70(%edx)
    440 	add	$0x80, %edx
    441 	cmp	$0x80, %ecx
    442 	jae	L(128bytesormore_nt)
    443 	sfence
        /* ECX is 0..127 here.  Drop the extra EBX push made at
           L(128bytesormore), where one was made, then finish through the tail
           table.  */
    444 L(shared_cache_loop_end):
    445 #if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
    446 	POP (%ebx)
    447 #endif
    448 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    449 
    450 
    451 	.pushsection .rodata.sse2,"a",@progbits
        /* Dispatch table for remaining counts 0..127 once EDX is 16-byte
           aligned: entry n refers to L(aligned_16_<n>bytes), encoded by JMPTBL.
           Entries are 4-byte .int values, aligned to 4 bytes by ALIGN (2).  */
    452 	ALIGN (2)
    453 L(table_16_128bytes):
    454 	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
    455 	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
    456 	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
    457 	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
    458 	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
    459 	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
    460 	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
    461 	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
    462 	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
    463 	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
    464 	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
    465 	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
    466 	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
    467 	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
    468 	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
    469 	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
    470 	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
    471 	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
    472 	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
    473 	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
    474 	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
    475 	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
    476 	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
    477 	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
    478 	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
    479 	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
    480 	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
    481 	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
    482 	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
    483 	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
    484 	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
    485 	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
    486 	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
    487 	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
    488 	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
    489 	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
    490 	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
    491 	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
    492 	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
    493 	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
    494 	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
    495 	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
    496 	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
    497 	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
    498 	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
    499 	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
    500 	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
    501 	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
    502 	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
    503 	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
    504 	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
    505 	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
    506 	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
    507 	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
    508 	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
    509 	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
    510 	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
    511 	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
    512 	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
    513 	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
    514 	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
    515 	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
    516 	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
    517 	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
    518 	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
    519 	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
    520 	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
    521 	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
    522 	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
    523 	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
    524 	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
    525 	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
    526 	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
    527 	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
    528 	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
    529 	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
    530 	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
    531 	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
    532 	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
    533 	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
    534 	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
    535 	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
    536 	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
    537 	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
    538 	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
    539 	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
    540 	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
    541 	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
    542 	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
    543 	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
    544 	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
    545 	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
    546 	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
    547 	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
    548 	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
    549 	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
    550 	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
    551 	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
    552 	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
    553 	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
    554 	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
    555 	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
    556 	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
    557 	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
    558 	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
    559 	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
    560 	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
    561 	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
    562 	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
    563 	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
    564 	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
    565 	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
    566 	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
    567 	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
    568 	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
    569 	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
    570 	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
    571 	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
    572 	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
    573 	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
    574 	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
    575 	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
    576 	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
    577 	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
    578 	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
    579 	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
    580 	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
    581 	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
    582 	.popsection
    583 
    584 	ALIGN (4)
        /* Tails for remaining counts 0..127, reached through
           L(table_16_128bytes).  EDX was 16-byte aligned before
           BRANCH_TO_JMPTBL_ENTRY added the count to it, so at the entry label for
           count n the address -n(%edx) is 16-byte aligned, and every following
           label in the same chain is 16 bytes further on.  That is why movdqa is
           used even where the displacement is odd.  There is one fall-through
           chain per value of (count mod 16); the final 0..15 bytes are written
           from xmm0 (movq) and EAX (movl/movw/movb).
           This group: count mod 16 == 0..7.  */
    585 L(aligned_16_112bytes):
    586 	movdqa	%xmm0, -112(%edx)
    587 L(aligned_16_96bytes):
    588 	movdqa	%xmm0, -96(%edx)
    589 L(aligned_16_80bytes):
    590 	movdqa	%xmm0, -80(%edx)
    591 L(aligned_16_64bytes):
    592 	movdqa	%xmm0, -64(%edx)
    593 L(aligned_16_48bytes):
    594 	movdqa	%xmm0, -48(%edx)
    595 L(aligned_16_32bytes):
    596 	movdqa	%xmm0, -32(%edx)
    597 L(aligned_16_16bytes):
    598 	movdqa	%xmm0, -16(%edx)
    599 L(aligned_16_0bytes):
    600 	SETRTNVAL
    601 	RETURN
    602 
    603 	ALIGN (4)
    604 L(aligned_16_113bytes):
    605 	movdqa	%xmm0, -113(%edx)
    606 L(aligned_16_97bytes):
    607 	movdqa	%xmm0, -97(%edx)
    608 L(aligned_16_81bytes):
    609 	movdqa	%xmm0, -81(%edx)
    610 L(aligned_16_65bytes):
    611 	movdqa	%xmm0, -65(%edx)
    612 L(aligned_16_49bytes):
    613 	movdqa	%xmm0, -49(%edx)
    614 L(aligned_16_33bytes):
    615 	movdqa	%xmm0, -33(%edx)
    616 L(aligned_16_17bytes):
    617 	movdqa	%xmm0, -17(%edx)
    618 L(aligned_16_1bytes):
    619 	movb	%al, -1(%edx)
    620 	SETRTNVAL
    621 	RETURN
    622 
    623 	ALIGN (4)
    624 L(aligned_16_114bytes):
    625 	movdqa	%xmm0, -114(%edx)
    626 L(aligned_16_98bytes):
    627 	movdqa	%xmm0, -98(%edx)
    628 L(aligned_16_82bytes):
    629 	movdqa	%xmm0, -82(%edx)
    630 L(aligned_16_66bytes):
    631 	movdqa	%xmm0, -66(%edx)
    632 L(aligned_16_50bytes):
    633 	movdqa	%xmm0, -50(%edx)
    634 L(aligned_16_34bytes):
    635 	movdqa	%xmm0, -34(%edx)
    636 L(aligned_16_18bytes):
    637 	movdqa	%xmm0, -18(%edx)
    638 L(aligned_16_2bytes):
    639 	movw	%ax, -2(%edx)
    640 	SETRTNVAL
    641 	RETURN
    642 
    643 	ALIGN (4)
    644 L(aligned_16_115bytes):
    645 	movdqa	%xmm0, -115(%edx)
    646 L(aligned_16_99bytes):
    647 	movdqa	%xmm0, -99(%edx)
    648 L(aligned_16_83bytes):
    649 	movdqa	%xmm0, -83(%edx)
    650 L(aligned_16_67bytes):
    651 	movdqa	%xmm0, -67(%edx)
    652 L(aligned_16_51bytes):
    653 	movdqa	%xmm0, -51(%edx)
    654 L(aligned_16_35bytes):
    655 	movdqa	%xmm0, -35(%edx)
    656 L(aligned_16_19bytes):
    657 	movdqa	%xmm0, -19(%edx)
    658 L(aligned_16_3bytes):
    659 	movw	%ax, -3(%edx)
    660 	movb	%al, -1(%edx)
    661 	SETRTNVAL
    662 	RETURN
    663 
    664 	ALIGN (4)
    665 L(aligned_16_116bytes):
    666 	movdqa	%xmm0, -116(%edx)
    667 L(aligned_16_100bytes):
    668 	movdqa	%xmm0, -100(%edx)
    669 L(aligned_16_84bytes):
    670 	movdqa	%xmm0, -84(%edx)
    671 L(aligned_16_68bytes):
    672 	movdqa	%xmm0, -68(%edx)
    673 L(aligned_16_52bytes):
    674 	movdqa	%xmm0, -52(%edx)
    675 L(aligned_16_36bytes):
    676 	movdqa	%xmm0, -36(%edx)
    677 L(aligned_16_20bytes):
    678 	movdqa	%xmm0, -20(%edx)
    679 L(aligned_16_4bytes):
    680 	movl	%eax, -4(%edx)
    681 	SETRTNVAL
    682 	RETURN
    683 
    684 	ALIGN (4)
    685 L(aligned_16_117bytes):
    686 	movdqa	%xmm0, -117(%edx)
    687 L(aligned_16_101bytes):
    688 	movdqa	%xmm0, -101(%edx)
    689 L(aligned_16_85bytes):
    690 	movdqa	%xmm0, -85(%edx)
    691 L(aligned_16_69bytes):
    692 	movdqa	%xmm0, -69(%edx)
    693 L(aligned_16_53bytes):
    694 	movdqa	%xmm0, -53(%edx)
    695 L(aligned_16_37bytes):
    696 	movdqa	%xmm0, -37(%edx)
    697 L(aligned_16_21bytes):
    698 	movdqa	%xmm0, -21(%edx)
    699 L(aligned_16_5bytes):
    700 	movl	%eax, -5(%edx)
    701 	movb	%al, -1(%edx)
    702 	SETRTNVAL
    703 	RETURN
    704 
    705 	ALIGN (4)
    706 L(aligned_16_118bytes):
    707 	movdqa	%xmm0, -118(%edx)
    708 L(aligned_16_102bytes):
    709 	movdqa	%xmm0, -102(%edx)
    710 L(aligned_16_86bytes):
    711 	movdqa	%xmm0, -86(%edx)
    712 L(aligned_16_70bytes):
    713 	movdqa	%xmm0, -70(%edx)
    714 L(aligned_16_54bytes):
    715 	movdqa	%xmm0, -54(%edx)
    716 L(aligned_16_38bytes):
    717 	movdqa	%xmm0, -38(%edx)
    718 L(aligned_16_22bytes):
    719 	movdqa	%xmm0, -22(%edx)
    720 L(aligned_16_6bytes):
    721 	movl	%eax, -6(%edx)
    722 	movw	%ax, -2(%edx)
    723 	SETRTNVAL
    724 	RETURN
    725 
    726 	ALIGN (4)
    727 L(aligned_16_119bytes):
    728 	movdqa	%xmm0, -119(%edx)
    729 L(aligned_16_103bytes):
    730 	movdqa	%xmm0, -103(%edx)
    731 L(aligned_16_87bytes):
    732 	movdqa	%xmm0, -87(%edx)
    733 L(aligned_16_71bytes):
    734 	movdqa	%xmm0, -71(%edx)
    735 L(aligned_16_55bytes):
    736 	movdqa	%xmm0, -55(%edx)
    737 L(aligned_16_39bytes):
    738 	movdqa	%xmm0, -39(%edx)
    739 L(aligned_16_23bytes):
    740 	movdqa	%xmm0, -23(%edx)
    741 L(aligned_16_7bytes):
    742 	movl	%eax, -7(%edx)
    743 	movw	%ax, -3(%edx)
    744 	movb	%al, -1(%edx)
    745 	SETRTNVAL
    746 	RETURN
    747 
    748 	ALIGN (4)
        /* Tails for remaining counts with count mod 16 == 8..15, reached through
           L(table_16_128bytes).  EDX was 16-byte aligned before the count was
           added to it, so -n(%edx) at the entry label for count n is 16-byte
           aligned and each later label in the chain is 16 bytes further on;
           movdqa is therefore valid despite the odd displacements.  The last
           8..15 bytes are written with movq from xmm0 followed by
           movl/movw/movb from EAX.  */
    749 L(aligned_16_120bytes):
    750 	movdqa	%xmm0, -120(%edx)
    751 L(aligned_16_104bytes):
    752 	movdqa	%xmm0, -104(%edx)
    753 L(aligned_16_88bytes):
    754 	movdqa	%xmm0, -88(%edx)
    755 L(aligned_16_72bytes):
    756 	movdqa	%xmm0, -72(%edx)
    757 L(aligned_16_56bytes):
    758 	movdqa	%xmm0, -56(%edx)
    759 L(aligned_16_40bytes):
    760 	movdqa	%xmm0, -40(%edx)
    761 L(aligned_16_24bytes):
    762 	movdqa	%xmm0, -24(%edx)
    763 L(aligned_16_8bytes):
    764 	movq	%xmm0, -8(%edx)
    765 	SETRTNVAL
    766 	RETURN
    767 
    768 	ALIGN (4)
    769 L(aligned_16_121bytes):
    770 	movdqa	%xmm0, -121(%edx)
    771 L(aligned_16_105bytes):
    772 	movdqa	%xmm0, -105(%edx)
    773 L(aligned_16_89bytes):
    774 	movdqa	%xmm0, -89(%edx)
    775 L(aligned_16_73bytes):
    776 	movdqa	%xmm0, -73(%edx)
    777 L(aligned_16_57bytes):
    778 	movdqa	%xmm0, -57(%edx)
    779 L(aligned_16_41bytes):
    780 	movdqa	%xmm0, -41(%edx)
    781 L(aligned_16_25bytes):
    782 	movdqa	%xmm0, -25(%edx)
    783 L(aligned_16_9bytes):
    784 	movq	%xmm0, -9(%edx)
    785 	movb	%al, -1(%edx)
    786 	SETRTNVAL
    787 	RETURN
    788 
    789 	ALIGN (4)
    790 L(aligned_16_122bytes):
    791 	movdqa	%xmm0, -122(%edx)
    792 L(aligned_16_106bytes):
    793 	movdqa	%xmm0, -106(%edx)
    794 L(aligned_16_90bytes):
    795 	movdqa	%xmm0, -90(%edx)
    796 L(aligned_16_74bytes):
    797 	movdqa	%xmm0, -74(%edx)
    798 L(aligned_16_58bytes):
    799 	movdqa	%xmm0, -58(%edx)
    800 L(aligned_16_42bytes):
    801 	movdqa	%xmm0, -42(%edx)
    802 L(aligned_16_26bytes):
    803 	movdqa	%xmm0, -26(%edx)
    804 L(aligned_16_10bytes):
    805 	movq	%xmm0, -10(%edx)
    806 	movw	%ax, -2(%edx)
    807 	SETRTNVAL
    808 	RETURN
    809 
    810 	ALIGN (4)
    811 L(aligned_16_123bytes):
    812 	movdqa	%xmm0, -123(%edx)
    813 L(aligned_16_107bytes):
    814 	movdqa	%xmm0, -107(%edx)
    815 L(aligned_16_91bytes):
    816 	movdqa	%xmm0, -91(%edx)
    817 L(aligned_16_75bytes):
    818 	movdqa	%xmm0, -75(%edx)
    819 L(aligned_16_59bytes):
    820 	movdqa	%xmm0, -59(%edx)
    821 L(aligned_16_43bytes):
    822 	movdqa	%xmm0, -43(%edx)
    823 L(aligned_16_27bytes):
    824 	movdqa	%xmm0, -27(%edx)
    825 L(aligned_16_11bytes):
    826 	movq	%xmm0, -11(%edx)
    827 	movw	%ax, -3(%edx)
    828 	movb	%al, -1(%edx)
    829 	SETRTNVAL
    830 	RETURN
    831 
    832 	ALIGN (4)
    833 L(aligned_16_124bytes):
    834 	movdqa	%xmm0, -124(%edx)
    835 L(aligned_16_108bytes):
    836 	movdqa	%xmm0, -108(%edx)
    837 L(aligned_16_92bytes):
    838 	movdqa	%xmm0, -92(%edx)
    839 L(aligned_16_76bytes):
    840 	movdqa	%xmm0, -76(%edx)
    841 L(aligned_16_60bytes):
    842 	movdqa	%xmm0, -60(%edx)
    843 L(aligned_16_44bytes):
    844 	movdqa	%xmm0, -44(%edx)
    845 L(aligned_16_28bytes):
    846 	movdqa	%xmm0, -28(%edx)
    847 L(aligned_16_12bytes):
    848 	movq	%xmm0, -12(%edx)
    849 	movl	%eax, -4(%edx)
    850 	SETRTNVAL
    851 	RETURN
    852 
    853 	ALIGN (4)
    854 L(aligned_16_125bytes):
    855 	movdqa	%xmm0, -125(%edx)
    856 L(aligned_16_109bytes):
    857 	movdqa	%xmm0, -109(%edx)
    858 L(aligned_16_93bytes):
    859 	movdqa	%xmm0, -93(%edx)
    860 L(aligned_16_77bytes):
    861 	movdqa	%xmm0, -77(%edx)
    862 L(aligned_16_61bytes):
    863 	movdqa	%xmm0, -61(%edx)
    864 L(aligned_16_45bytes):
    865 	movdqa	%xmm0, -45(%edx)
    866 L(aligned_16_29bytes):
    867 	movdqa	%xmm0, -29(%edx)
    868 L(aligned_16_13bytes):
    869 	movq	%xmm0, -13(%edx)
    870 	movl	%eax, -5(%edx)
    871 	movb	%al, -1(%edx)
    872 	SETRTNVAL
    873 	RETURN
    874 
    875 	ALIGN (4)
    876 L(aligned_16_126bytes):
    877 	movdqa	%xmm0, -126(%edx)
    878 L(aligned_16_110bytes):
    879 	movdqa	%xmm0, -110(%edx)
    880 L(aligned_16_94bytes):
    881 	movdqa	%xmm0, -94(%edx)
    882 L(aligned_16_78bytes):
    883 	movdqa	%xmm0, -78(%edx)
    884 L(aligned_16_62bytes):
    885 	movdqa	%xmm0, -62(%edx)
    886 L(aligned_16_46bytes):
    887 	movdqa	%xmm0, -46(%edx)
    888 L(aligned_16_30bytes):
    889 	movdqa	%xmm0, -30(%edx)
    890 L(aligned_16_14bytes):
    891 	movq	%xmm0, -14(%edx)
    892 	movl	%eax, -6(%edx)
    893 	movw	%ax, -2(%edx)
    894 	SETRTNVAL
    895 	RETURN
    896 
    897 	ALIGN (4)
    898 L(aligned_16_127bytes):
    899 	movdqa	%xmm0, -127(%edx)
    900 L(aligned_16_111bytes):
    901 	movdqa	%xmm0, -111(%edx)
    902 L(aligned_16_95bytes):
    903 	movdqa	%xmm0, -95(%edx)
    904 L(aligned_16_79bytes):
    905 	movdqa	%xmm0, -79(%edx)
    906 L(aligned_16_63bytes):
    907 	movdqa	%xmm0, -63(%edx)
    908 L(aligned_16_47bytes):
    909 	movdqa	%xmm0, -47(%edx)
    910 L(aligned_16_31bytes):
    911 	movdqa	%xmm0, -31(%edx)
    912 L(aligned_16_15bytes):
    913 	movq	%xmm0, -15(%edx)
    914 	movl	%eax, -7(%edx)
    915 	movw	%ax, -3(%edx)
    916 	movb	%al, -1(%edx)
    917 	SETRTNVAL
        	/* Last exit of the function: RETURN_END is used instead of RETURN
        	   because no code follows that would need the EBX push re-declared
        	   to the unwinder.  */
    918 	RETURN_END
    919 
    920 END (MEMSET)
    921