/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Contributed by: Intel Corporation
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through
 * the C preprocessor.  Target: 32-bit x86 with SSE2.
 */

/* L(name) expands to an assembler-local label (.L prefix).  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) aligns to a 2^n byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the assembler's CFI directives.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name) opens a global, 16-byte aligned function and its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name) closes the CFI region and records the function size.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push/pop of REG.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/*
 * Stack offsets of the arguments, relative to %esp after ENTRANCE.
 * PARMS (defined below) is the offset of the first argument.
 * With USE_AS_BZERO32 there is no value argument.
 */
#ifdef USE_AS_BZERO32
# define DEST		PARMS
# define LEN		DEST+4
#else
# define DEST		PARMS
# define DWDS		DEST+4
# define LEN		DWDS+4
#endif

/* Only the USE_AS_WMEMSET32 variant loads a return value (DEST) into EAX.  */
#ifdef USE_AS_WMEMSET32
# define SETRTNVAL	movl DEST(%esp), %eax
#else
# define SETRTNVAL
#endif

#ifdef SHARED
/* Position-independent build: EBX is saved on entry and restored on return. */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* After the ret, re-assert the "EBX pushed" CFI state for the code that
   follows in the same function.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the table index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Add the relative entry to get the absolute address.  */	\
    add		(%ebx,%ecx,4), %ebx;				\
    /* EBX now holds the absolute target address.  Go.  */	\
    jmp		*%ebx

/* Returns the caller's return address (i.e. its PC) in EBX.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the table index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    jmp		*TABLE(,%ecx,4)
#endif

/*
 * sse2_memset32_atom -- fill a buffer with a repeated 32-bit value.
 *
 * Arguments are read from the stack (32-bit x86, relative to %esp):
 *   DEST(%esp)  destination address
 *   DWDS(%esp)  32-bit fill value (not present with USE_AS_BZERO32,
 *               where the fill value is zero)
 *   LEN(%esp)   length.  Used directly as a count of 32-bit words; with
 *               USE_AS_ANDROID it is first shifted right by 2
 *               (presumably a byte count in that build -- confirm
 *               against the callers).
 *
 * Return value: EAX = DEST only with USE_AS_WMEMSET32; otherwise no
 * return value is set up.
 *
 * Register roles inside the function:
 *   EAX   fill pattern (also used briefly as scratch, then reloaded)
 *   ECX   remaining length: dwords up to L(aligned4bytes), bytes after
 *         it, and a dword index again just before each table jump
 *   EDX   destination cursor; moved to the END of the remaining region
 *         before a table jump, so tail stores use negative offsets
 *   XMM0  fill pattern replicated into all four dword lanes
 *   EBX   scratch for PIC addressing and the cache-size value; saved and
 *         restored via ENTRANCE/RETURN (SHARED) or PUSH/POP around its
 *         use (non-SHARED)
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (sse2_memset32_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
#ifdef USE_AS_ANDROID
	shr	$2, %ecx
#endif
#ifdef USE_AS_BZERO32
	xor	%eax, %eax
#else
	mov	DWDS(%esp), %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$16, %ecx
	jae	L(16dbwordsormore)

/* Fewer than 16 dwords: point EDX past the end and jump into the
   fall-through store ladder at the entry matching the count.  */
L(write_less16dbwords):
	lea	(%edx, %ecx, 4), %edx
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	ALIGN (4)
L(write_15dbwords):
	movl	%eax, -60(%edx)
L(write_14dbwords):
	movl	%eax, -56(%edx)
L(write_13dbwords):
	movl	%eax, -52(%edx)
L(write_12dbwords):
	movl	%eax, -48(%edx)
L(write_11dbwords):
	movl	%eax, -44(%edx)
L(write_10dbwords):
	movl	%eax, -40(%edx)
L(write_9dbwords):
	movl	%eax, -36(%edx)
L(write_8dbwords):
	movl	%eax, -32(%edx)
L(write_7dbwords):
	movl	%eax, -28(%edx)
L(write_6dbwords):
	movl	%eax, -24(%edx)
L(write_5dbwords):
	movl	%eax, -20(%edx)
L(write_4dbwords):
	movl	%eax, -16(%edx)
L(write_3dbwords):
	movl	%eax, -12(%edx)
L(write_2dbwords):
	movl	%eax, -8(%edx)
L(write_1dbwords):
	movl	%eax, -4(%edx)
L(write_0dbwords):
	SETRTNVAL
	RETURN

/* 16 dwords or more.  If EDX is not 4-byte aligned, store the first and
   last dword unaligned, drop one dword from the count, then step EDX
   forward a byte at a time to a 4-byte boundary, rotating the pattern
   by one byte per step so the aligned stores stay in phase.  */
	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edx
	jz	L(aligned4bytes)
	mov	%eax, (%edx)
	mov	%eax, -4(%edx, %ecx, 4)
	sub	$1, %ecx
	rol	$24, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
L(aligned4bytes):
	/* From here on ECX counts bytes.  */
	shl	$2, %ecx

#ifdef USE_AS_BZERO32
	pxor	%xmm0, %xmm0
#else
	/* Broadcast the 32-bit pattern to all four lanes of XMM0.  */
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX > 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Store 16 unaligned bytes, round EDX up to the next 16-byte
	   boundary and shrink ECX by the number of bytes skipped.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax
	add	%eax, %ecx
	/* EAX was used as scratch; reload the pattern from XMM0.  */
	movd	%xmm0, %eax
	ALIGN (4)
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* 128 bytes or more, EDX 16-byte aligned.  Load the shared cache size
   into EBX and pick a store strategy from the length.  */
	ALIGN (4)
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	/* Length >= shared cache size: go to the path that ends with
	   non-temporal stores.  */
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

#ifdef DATA_CACHE_SIZE
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# ifdef SHARED
#  define RESTORE_EBX_STATE
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	POP (%ebx)
#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	/* Length >= data cache size: use the prefetching loop.  */
	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so the loop's own "sub $128" borrows exactly
	   when fewer than 128 bytes remain after the current chunk.  */
	subl	$128, %ecx
/* Two 128-byte chunks per iteration.  The flags from each "sub" are
   still valid at the branch: movdqa and lea do not modify them.  */
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Undo the bias; ECX is the remaining byte count (< 128).  */
	lea	128(%ecx), %ecx
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* One 128-byte chunk per iteration, prefetching ahead of the stores.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* CFI only: describe the stack as it is on arrival at the label
	   below (reached by the jae above, before EBX was popped).  */
	RESTORE_EBX_STATE
/* Length >= shared cache size (EBX).  Fill EBX bytes, rounded down to a
   multiple of 128, with ordinary stores, then the rest with
   non-temporal stores.  ECX becomes the bytes left after the first loop. */
L(128bytesormore_nt_start):
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	/* EAX was used as scratch; reload the pattern from XMM0.  */
	movd	%xmm0, %eax
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)

	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	/* Fence after the run of non-temporal stores.  */
	sfence
L(shared_cache_loop_end):
#if defined DATA_CACHE_SIZE || !defined SHARED
	POP (%ebx)
#endif
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Tail dispatch for 0..124 remaining bytes (index = bytes / 4).  EDX
   points just past the end of the region.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

/* Tails that are a multiple of 16 bytes.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

/* Tails of 16k + 4 bytes: 16-byte stores, then one dword.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

/* Tails of 16k + 8 bytes: 16-byte stores, then one quadword.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

/* Tails of 16k + 12 bytes: 16-byte stores, a quadword, then a dword.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

END (sse2_memset32_atom)