Home | History | Annotate | Download | only in arch-x86
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 /*
     17  * Contributed by: Intel Corporation
     18  */
     19 
     20 #ifndef L
     21 # define L(label)	.L##label
     22 #endif
     23 
     24 #ifndef ALIGN
     25 # define ALIGN(n)	.p2align n
     26 #endif
     27 
     28 #ifndef cfi_startproc
     29 # define cfi_startproc			.cfi_startproc
     30 #endif
     31 
     32 #ifndef cfi_endproc
     33 # define cfi_endproc			.cfi_endproc
     34 #endif
     35 
     36 #ifndef cfi_rel_offset
     37 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     38 #endif
     39 
     40 #ifndef cfi_restore
     41 # define cfi_restore(reg)		.cfi_restore reg
     42 #endif
     43 
     44 #ifndef cfi_adjust_cfa_offset
     45 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     46 #endif
     47 
     48 #ifndef ENTRY
     49 # define ENTRY(name)			\
     50 	.type name,  @function; 	\
     51 	.globl name;			\
     52 	.p2align 4;			\
     53 name:					\
     54 	cfi_startproc
     55 #endif
     56 
     57 #ifndef END
     58 # define END(name)			\
     59 	cfi_endproc;			\
     60 	.size name, .-name
     61 #endif
     62 
     63 #define CFI_PUSH(REG)						\
     64   cfi_adjust_cfa_offset (4);					\
     65   cfi_rel_offset (REG, 0)
     66 
     67 #define CFI_POP(REG)						\
     68   cfi_adjust_cfa_offset (-4);					\
     69   cfi_restore (REG)
     70 
     71 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
     72 #define POP(REG)	popl REG; CFI_POP (REG)
     73 
     74 #ifdef USE_AS_BZERO32
     75 # define DEST		PARMS
     76 # define LEN		DEST+4
     77 #else
     78 # define DEST		PARMS
     79 # define DWDS		DEST+4
     80 # define LEN		DWDS+4
     81 #endif
     82 
     83 #ifdef USE_AS_WMEMSET32
     84 # define SETRTNVAL	movl DEST(%esp), %eax
     85 #else
     86 # define SETRTNVAL
     87 #endif
     88 
     89 #ifdef SHARED
     90 # define ENTRANCE	PUSH (%ebx);
     91 # define RETURN_END	POP (%ebx); ret
     92 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
     93 # define PARMS		8		/* Preserve EBX.  */
     94 # define JMPTBL(I, B)	I - B
     95 
     96 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
     97    jump table with relative offsets.   */
     98 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
     99     /* We first load PC into EBX.  */				\
    100     call	__i686.get_pc_thunk.bx;				\
    101     /* Get the address of the jump table.  */			\
    102     add		$(TABLE - .), %ebx;				\
    103     /* Get the entry and convert the relative offset to the	\
    104        absolute address.  */					\
    105     add		(%ebx,%ecx,4), %ebx;				\
    106     /* We loaded the jump table and adjuested EDX. Go.  */	\
    107     jmp		*%ebx
    108 
    109 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
    110 	.globl	__i686.get_pc_thunk.bx
    111 	.hidden	__i686.get_pc_thunk.bx
    112 	ALIGN (4)
    113 	.type	__i686.get_pc_thunk.bx,@function
    114 __i686.get_pc_thunk.bx:
    115 	movl	(%esp), %ebx
    116 	ret
    117 #else
    118 # define ENTRANCE
    119 # define RETURN_END	ret
    120 # define RETURN		RETURN_END
    121 # define PARMS		4
    122 # define JMPTBL(I, B)	I
    123 
    124 /* Branch to an entry in a jump table.  TABLE is a jump table with
    125    absolute offsets.  */
    126 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    127     jmp		*TABLE(,%ecx,4)
    128 #endif
    129 
    130 	.section .text.sse2,"ax",@progbits
    131 	ALIGN (4)
    132 ENTRY (sse2_memset32_atom)
    133 	ENTRANCE
    134 
    135 	movl	LEN(%esp), %ecx
    136 #ifdef USE_AS_ANDROID
    137 	shr     $2, %ecx
    138 #endif
    139 #ifdef USE_AS_BZERO32
    140 	xor	%eax, %eax
    141 #else
    142 	mov	DWDS(%esp), %eax
    143 	mov	%eax, %edx
    144 #endif
    145 	movl	DEST(%esp), %edx
    146 	cmp	$16, %ecx
    147 	jae	L(16dbwordsormore)
    148 
    149 L(write_less16dbwords):
    150 	lea	(%edx, %ecx, 4), %edx
    151 	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))
    152 
    153 	.pushsection .rodata.sse2,"a",@progbits
    154 	ALIGN (2)
    155 L(table_less16dbwords):
    156 	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
    157 	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
    158 	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
    159 	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
    160 	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
    161 	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
    162 	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
    163 	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
    164 	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
    165 	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
    166 	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
    167 	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
    168 	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
    169 	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
    170 	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
    171 	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
    172 	.popsection
    173 
    174 	ALIGN (4)
    175 L(write_15dbwords):
    176 	movl	%eax, -60(%edx)
    177 L(write_14dbwords):
    178 	movl	%eax, -56(%edx)
    179 L(write_13dbwords):
    180 	movl	%eax, -52(%edx)
    181 L(write_12dbwords):
    182 	movl	%eax, -48(%edx)
    183 L(write_11dbwords):
    184 	movl	%eax, -44(%edx)
    185 L(write_10dbwords):
    186 	movl	%eax, -40(%edx)
    187 L(write_9dbwords):
    188 	movl	%eax, -36(%edx)
    189 L(write_8dbwords):
    190 	movl	%eax, -32(%edx)
    191 L(write_7dbwords):
    192 	movl	%eax, -28(%edx)
    193 L(write_6dbwords):
    194 	movl	%eax, -24(%edx)
    195 L(write_5dbwords):
    196 	movl	%eax, -20(%edx)
    197 L(write_4dbwords):
    198 	movl	%eax, -16(%edx)
    199 L(write_3dbwords):
    200 	movl	%eax, -12(%edx)
    201 L(write_2dbwords):
    202 	movl	%eax, -8(%edx)
    203 L(write_1dbwords):
    204 	movl	%eax, -4(%edx)
    205 L(write_0dbwords):
    206 	SETRTNVAL
    207 	RETURN
    208 
    209 	ALIGN (4)
    210 L(16dbwordsormore):
    211 	test	$3, %edx
    212 	jz	L(aligned4bytes)
    213 	mov	%eax, (%edx)
    214 	mov	%eax, -4(%edx, %ecx, 4)
    215 	sub	$1, %ecx
    216 	rol	$24, %eax
    217 	add	$1, %edx
    218 	test	$3, %edx
    219 	jz	L(aligned4bytes)
    220 	ror	$8, %eax
    221 	add	$1, %edx
    222 	test	$3, %edx
    223 	jz	L(aligned4bytes)
    224 	ror	$8, %eax
    225 	add	$1, %edx
    226 L(aligned4bytes):
    227 	shl	$2, %ecx
    228 
    229 #ifdef USE_AS_BZERO32
    230 	pxor	%xmm0, %xmm0
    231 #else
    232 	movd	%eax, %xmm0
    233 	pshufd	$0, %xmm0, %xmm0
    234 #endif
    235 	testl	$0xf, %edx
    236 	jz	L(aligned_16)
    237 /* ECX > 32 and EDX is not 16 byte aligned.  */
    238 L(not_aligned_16):
    239 	movdqu	%xmm0, (%edx)
    240 	movl	%edx, %eax
    241 	and	$-16, %edx
    242 	add	$16, %edx
    243 	sub	%edx, %eax
    244 	add	%eax, %ecx
    245 	movd	%xmm0, %eax
    246 	ALIGN (4)
    247 L(aligned_16):
    248 	cmp	$128, %ecx
    249 	jae	L(128bytesormore)
    250 
    251 L(aligned_16_less128bytes):
    252 	add	%ecx, %edx
    253 	shr	$2, %ecx
    254 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    255 
    256 	ALIGN (4)
    257 L(128bytesormore):
    258 #ifdef SHARED_CACHE_SIZE
    259 	PUSH (%ebx)
    260 	mov	$SHARED_CACHE_SIZE, %ebx
    261 #else
    262 # ifdef SHARED
    263 	call	__i686.get_pc_thunk.bx
    264 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    265 	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
    266 # else
    267 	PUSH (%ebx)
    268 	mov	__x86_shared_cache_size, %ebx
    269 # endif
    270 #endif
    271 	cmp	%ebx, %ecx
    272 	jae	L(128bytesormore_nt_start)
    273 
    274 #ifdef DATA_CACHE_SIZE
    275 	POP (%ebx)
    276 # define RESTORE_EBX_STATE CFI_PUSH (%ebx)
    277 	cmp	$DATA_CACHE_SIZE, %ecx
    278 #else
    279 # ifdef SHARED
    280 #  define RESTORE_EBX_STATE
    281 	call	__i686.get_pc_thunk.bx
    282 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
    283 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
    284 # else
    285 	POP (%ebx)
    286 #  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
    287 	cmp	__x86_data_cache_size, %ecx
    288 # endif
    289 #endif
    290 
    291 	jae	L(128bytes_L2_normal)
    292 	subl	$128, %ecx
    293 L(128bytesormore_normal):
    294 	sub	$128, %ecx
    295 	movdqa	%xmm0, (%edx)
    296 	movdqa	%xmm0, 0x10(%edx)
    297 	movdqa	%xmm0, 0x20(%edx)
    298 	movdqa	%xmm0, 0x30(%edx)
    299 	movdqa	%xmm0, 0x40(%edx)
    300 	movdqa	%xmm0, 0x50(%edx)
    301 	movdqa	%xmm0, 0x60(%edx)
    302 	movdqa	%xmm0, 0x70(%edx)
    303 	lea	128(%edx), %edx
    304 	jb	L(128bytesless_normal)
    305 
    306 
    307 	sub	$128, %ecx
    308 	movdqa	%xmm0, (%edx)
    309 	movdqa	%xmm0, 0x10(%edx)
    310 	movdqa	%xmm0, 0x20(%edx)
    311 	movdqa	%xmm0, 0x30(%edx)
    312 	movdqa	%xmm0, 0x40(%edx)
    313 	movdqa	%xmm0, 0x50(%edx)
    314 	movdqa	%xmm0, 0x60(%edx)
    315 	movdqa	%xmm0, 0x70(%edx)
    316 	lea	128(%edx), %edx
    317 	jae	L(128bytesormore_normal)
    318 
    319 L(128bytesless_normal):
    320 	lea	128(%ecx), %ecx
    321 	add	%ecx, %edx
    322 	shr	$2, %ecx
    323 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    324 
    325 	ALIGN (4)
    326 L(128bytes_L2_normal):
    327 	prefetcht0	0x380(%edx)
    328 	prefetcht0	0x3c0(%edx)
    329 	sub	$128, %ecx
    330 	movdqa	%xmm0, (%edx)
    331 	movaps	%xmm0, 0x10(%edx)
    332 	movaps	%xmm0, 0x20(%edx)
    333 	movaps	%xmm0, 0x30(%edx)
    334 	movaps	%xmm0, 0x40(%edx)
    335 	movaps	%xmm0, 0x50(%edx)
    336 	movaps	%xmm0, 0x60(%edx)
    337 	movaps	%xmm0, 0x70(%edx)
    338 	add	$128, %edx
    339 	cmp	$128, %ecx
    340 	jae	L(128bytes_L2_normal)
    341 
    342 L(128bytesless_L2_normal):
    343 	add	%ecx, %edx
    344 	shr	$2, %ecx
    345 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    346 
    347 	RESTORE_EBX_STATE
    348 L(128bytesormore_nt_start):
    349 	sub	%ebx, %ecx
    350 	mov	%ebx, %eax
    351 	and	$0x7f, %eax
    352 	add	%eax, %ecx
    353 	movd	%xmm0, %eax
    354 	ALIGN (4)
    355 L(128bytesormore_shared_cache_loop):
    356 	prefetcht0	0x3c0(%edx)
    357 	prefetcht0	0x380(%edx)
    358 	sub	$0x80, %ebx
    359 	movdqa	%xmm0, (%edx)
    360 	movdqa	%xmm0, 0x10(%edx)
    361 	movdqa	%xmm0, 0x20(%edx)
    362 	movdqa	%xmm0, 0x30(%edx)
    363 	movdqa	%xmm0, 0x40(%edx)
    364 	movdqa	%xmm0, 0x50(%edx)
    365 	movdqa	%xmm0, 0x60(%edx)
    366 	movdqa	%xmm0, 0x70(%edx)
    367 	add	$0x80, %edx
    368 	cmp	$0x80, %ebx
    369 	jae	L(128bytesormore_shared_cache_loop)
    370 	cmp	$0x80, %ecx
    371 	jb	L(shared_cache_loop_end)
    372 
    373 	ALIGN (4)
    374 L(128bytesormore_nt):
    375 	sub	$0x80, %ecx
    376 	movntdq	%xmm0, (%edx)
    377 	movntdq	%xmm0, 0x10(%edx)
    378 	movntdq	%xmm0, 0x20(%edx)
    379 	movntdq	%xmm0, 0x30(%edx)
    380 	movntdq	%xmm0, 0x40(%edx)
    381 	movntdq	%xmm0, 0x50(%edx)
    382 	movntdq	%xmm0, 0x60(%edx)
    383 	movntdq	%xmm0, 0x70(%edx)
    384 	add	$0x80, %edx
    385 	cmp	$0x80, %ecx
    386 	jae	L(128bytesormore_nt)
    387 	sfence
    388 L(shared_cache_loop_end):
    389 #if defined DATA_CACHE_SIZE || !defined SHARED
    390 	POP (%ebx)
    391 #endif
    392 	add	%ecx, %edx
    393 	shr	$2, %ecx
    394 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
    395 
    396 	.pushsection .rodata.sse2,"a",@progbits
    397 	ALIGN (2)
    398 L(table_16_128bytes):
    399 	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
    400 	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
    401 	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
    402 	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
    403 	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
    404 	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
    405 	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
    406 	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
    407 	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
    408 	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
    409 	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
    410 	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
    411 	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
    412 	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
    413 	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
    414 	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
    415 	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
    416 	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
    417 	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
    418 	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
    419 	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
    420 	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
    421 	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
    422 	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
    423 	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
    424 	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
    425 	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
    426 	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
    427 	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
    428 	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
    429 	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
    430 	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
    431 	.popsection
    432 
    433 	ALIGN (4)
    434 L(aligned_16_112bytes):
    435 	movdqa	%xmm0, -112(%edx)
    436 L(aligned_16_96bytes):
    437 	movdqa	%xmm0, -96(%edx)
    438 L(aligned_16_80bytes):
    439 	movdqa	%xmm0, -80(%edx)
    440 L(aligned_16_64bytes):
    441 	movdqa	%xmm0, -64(%edx)
    442 L(aligned_16_48bytes):
    443 	movdqa	%xmm0, -48(%edx)
    444 L(aligned_16_32bytes):
    445 	movdqa	%xmm0, -32(%edx)
    446 L(aligned_16_16bytes):
    447 	movdqa	%xmm0, -16(%edx)
    448 L(aligned_16_0bytes):
    449 	SETRTNVAL
    450 	RETURN
    451 
    452 	ALIGN (4)
    453 L(aligned_16_116bytes):
    454 	movdqa	%xmm0, -116(%edx)
    455 L(aligned_16_100bytes):
    456 	movdqa	%xmm0, -100(%edx)
    457 L(aligned_16_84bytes):
    458 	movdqa	%xmm0, -84(%edx)
    459 L(aligned_16_68bytes):
    460 	movdqa	%xmm0, -68(%edx)
    461 L(aligned_16_52bytes):
    462 	movdqa	%xmm0, -52(%edx)
    463 L(aligned_16_36bytes):
    464 	movdqa	%xmm0, -36(%edx)
    465 L(aligned_16_20bytes):
    466 	movdqa	%xmm0, -20(%edx)
    467 L(aligned_16_4bytes):
    468 	movl	%eax, -4(%edx)
    469 	SETRTNVAL
    470 	RETURN
    471 
    472 	ALIGN (4)
    473 L(aligned_16_120bytes):
    474 	movdqa	%xmm0, -120(%edx)
    475 L(aligned_16_104bytes):
    476 	movdqa	%xmm0, -104(%edx)
    477 L(aligned_16_88bytes):
    478 	movdqa	%xmm0, -88(%edx)
    479 L(aligned_16_72bytes):
    480 	movdqa	%xmm0, -72(%edx)
    481 L(aligned_16_56bytes):
    482 	movdqa	%xmm0, -56(%edx)
    483 L(aligned_16_40bytes):
    484 	movdqa	%xmm0, -40(%edx)
    485 L(aligned_16_24bytes):
    486 	movdqa	%xmm0, -24(%edx)
    487 L(aligned_16_8bytes):
    488 	movq	%xmm0, -8(%edx)
    489 	SETRTNVAL
    490 	RETURN
    491 
    492 	ALIGN (4)
    493 L(aligned_16_124bytes):
    494 	movdqa	%xmm0, -124(%edx)
    495 L(aligned_16_108bytes):
    496 	movdqa	%xmm0, -108(%edx)
    497 L(aligned_16_92bytes):
    498 	movdqa	%xmm0, -92(%edx)
    499 L(aligned_16_76bytes):
    500 	movdqa	%xmm0, -76(%edx)
    501 L(aligned_16_60bytes):
    502 	movdqa	%xmm0, -60(%edx)
    503 L(aligned_16_44bytes):
    504 	movdqa	%xmm0, -44(%edx)
    505 L(aligned_16_28bytes):
    506 	movdqa	%xmm0, -28(%edx)
    507 L(aligned_16_12bytes):
    508 	movq	%xmm0, -12(%edx)
    509 	movl	%eax, -4(%edx)
    510 	SETRTNVAL
    511 	RETURN
    512 
    513 END (sse2_memset32_atom)
    514