/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * 32-bit pattern fill for i386 using SSE2 (GNU as, AT&T syntax, run through
 * the C preprocessor).
 *
 * Arguments are read from the stack at DEST/DWDS/LEN (DEST/LEN only when
 * USE_AS_BZERO32 is defined, in which case the fill pattern is zero):
 *   DEST  destination pointer
 *   DWDS  32-bit fill pattern
 *   LEN   length in bytes; it is shifted right by 2 on entry, so the two
 *         low bits are discarded and only whole dwords are written
 * NOTE(review): presumably the C prototype is
 *   void android_memset32(uint32_t *dst, uint32_t value, size_t size);
 * confirm against the declaring header.
 *
 * Register roles inside MEMSET:
 *   EAX   fill pattern (rotated if the destination is not 4-byte aligned)
 *   ECX   remaining count: dwords on the short path and as jump-table
 *         index, bytes on the SSE path
 *   EDX   destination cursor; before every jump-table dispatch it points
 *         at the END of the region, and the table targets store at
 *         negative offsets from it
 *   XMM0  fill pattern replicated into all four dword lanes
 *   EBX   scratch (PC thunk / cache size); saved and restored because it
 *         is the only callee-saved register this code touches
 */

#include "cache.h"

/* In PIC builds ENTRANCE already saves EBX, and the code below only
   balances its additional PUSH/POP of EBX when SHARED_CACHE_SIZE and
   DATA_CACHE_SIZE are either both defined or both undefined.  Reject the
   mixed configurations at build time instead of corrupting the stack.  */
#if (defined SHARED || defined __PIC__)
# if (defined SHARED_CACHE_SIZE && !defined DATA_CACHE_SIZE) \
     || (!defined SHARED_CACHE_SIZE && defined DATA_CACHE_SIZE)
#  error "PIC build: define both SHARED_CACHE_SIZE and DATA_CACHE_SIZE, or neither"
# endif
#endif

#ifndef MEMSET
# define MEMSET		android_memset32
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Unwind-info bookkeeping for a 4-byte push/pop of REG.  */
#define CFI_PUSH(REG)						\
  cfi_adjust_cfa_offset (4);					\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)						\
  cfi_adjust_cfa_offset (-4);					\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments (relative to ESP after ENTRANCE) and the
   instruction, if any, that loads the return value before RETURN.  */
#ifdef USE_AS_BZERO32
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define DWDS		DEST+4
# define LEN		DWDS+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#if (defined SHARED || defined __PIC__)
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* Code follows RETURN, so re-assert that EBX is still pushed there.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.   */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__x86.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    /* We loaded the jump table entry and adjusted EBX. Go.  */	\
    jmp		*%ebx

/* Returns the caller's return address (i.e. its own PC) in EBX.  */
	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
	.globl	__x86.get_pc_thunk.bx
	.hidden	__x86.get_pc_thunk.bx
	ALIGN (4)
	.type	__x86.get_pc_thunk.bx,@function
__x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    jmp		*TABLE(,%ecx,4)
#endif

	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)
	ENTRANCE

	/* ECX = number of whole dwords, EAX = pattern, EDX = destination.  */
	movl	LEN(%esp), %ecx
	shr	$2, %ecx
#ifdef USE_AS_BZERO32
	xor	%eax, %eax
#else
	mov	DWDS(%esp), %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$16, %ecx
	jae	L(16dbwordsormore)

	/* Fewer than 16 dwords: point EDX past the end and jump into the
	   unrolled chain of dword stores at the entry for ECX dwords.  */
L(write_less16dbwords):
	lea	(%edx, %ecx, 4), %edx
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* Fall-through chain: entering at write_Ndbwords stores exactly N
	   dwords ending at EDX.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%eax, -60(%edx)
L(write_14dbwords):
	movl	%eax, -56(%edx)
L(write_13dbwords):
	movl	%eax, -52(%edx)
L(write_12dbwords):
	movl	%eax, -48(%edx)
L(write_11dbwords):
	movl	%eax, -44(%edx)
L(write_10dbwords):
	movl	%eax, -40(%edx)
L(write_9dbwords):
	movl	%eax, -36(%edx)
L(write_8dbwords):
	movl	%eax, -32(%edx)
L(write_7dbwords):
	movl	%eax, -28(%edx)
L(write_6dbwords):
	movl	%eax, -24(%edx)
L(write_5dbwords):
	movl	%eax, -20(%edx)
L(write_4dbwords):
	movl	%eax, -16(%edx)
L(write_3dbwords):
	movl	%eax, -12(%edx)
L(write_2dbwords):
	movl	%eax, -8(%edx)
L(write_1dbwords):
	movl	%eax, -4(%edx)
L(write_0dbwords):
	SETRTNVAL
	RETURN

	/* 16 or more dwords.  If the destination is not 4-byte aligned,
	   store the first and the last dword unaligned, drop one dword from
	   the count, and step EDX byte by byte up to the next 4-byte
	   boundary.  Each byte stepped rotates the pattern right by 8 bits
	   (rol $24 is the same rotation as ror $8) so that the aligned
	   stores continue the same byte sequence.  */
	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edx
	jz	L(aligned4bytes)
	mov	%eax, (%edx)
	mov	%eax, -4(%edx, %ecx, 4)
	sub	$1, %ecx
	rol	$24, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
L(aligned4bytes):
	/* From here on ECX counts bytes, not dwords.  */
	shl	$2, %ecx

#ifdef USE_AS_BZERO32
	pxor	%xmm0, %xmm0
#else
	/* Broadcast the (possibly rotated) pattern to all four lanes.  */
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX > 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Store 16 bytes unaligned, round EDX up to the next 16-byte
	   boundary and shrink ECX by the bytes skipped.  EAX holds
	   old EDX - new EDX (negative) and is then reloaded with the
	   pattern from XMM0.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* At least 128 bytes, EDX 16-byte aligned.  Choose a store loop by
	   comparing the byte count with the shared and data cache sizes.
	   EBX is loaded with the shared cache size; where ENTRANCE has not
	   already saved EBX (or a build-time constant is used) it is pushed
	   first.  */
	ALIGN (4)
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* RESTORE_EBX_STATE emits only CFI directives: it re-describes EBX
	   as pushed for the code at L(128bytesormore_nt_start), which is
	   reached with the PUSH above still outstanding.  */
#ifdef DATA_CACHE_SIZE
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# if (defined SHARED || defined __PIC__)
#  define RESTORE_EBX_STATE
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	POP (%ebx)
#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	jae	L(128bytes_L2_normal)
	/* Smaller than the data cache: plain aligned stores, two 128-byte
	   groups per iteration.  ECX is biased by -128 so that the carry
	   from each `sub` tells whether another full group remains; the
	   movdqa/lea instructions between `sub` and the branch leave the
	   flags untouched.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Undo the bias: ECX = bytes still to write (0..124).  */
	lea	128(%ecx), %ecx
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* At least the data cache size but below the shared cache size:
	   aligned stores with software prefetch ahead of the cursor.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	RESTORE_EBX_STATE
	/* At least the shared cache size.  The first loop writes EBX rounded
	   down to a multiple of 128 bytes with ordinary stores; ECX is set
	   up front to the bytes that remain after that.  EAX is used as
	   scratch and then reloaded with the pattern from XMM0.  */
L(128bytesormore_nt_start):
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)

	/* Remaining full 128-byte groups use non-temporal stores, followed
	   by a store fence.  */
	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
L(shared_cache_loop_end):
	/* Undo the PUSH done in L(128bytesormore), when there was one.  */
#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	POP (%ebx)
#endif
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* Tail dispatch, indexed by remaining dwords (0..31).  The chains
	   below are grouped by (remaining bytes mod 16) so that every
	   movdqa hits a 16-byte aligned address; the 4/8/12 leftover bytes
	   are finished with movl/movq.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

	/* Remaining bytes == 0 (mod 16).  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* Remaining bytes == 4 (mod 16): final dword via EAX.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* Remaining bytes == 8 (mod 16): final 8 bytes via movq.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* Remaining bytes == 12 (mod 16): final 12 bytes via movq + movl.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

END (MEMSET)