1 /* 2 Copyright (c) 2010, Intel Corporation 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 11 * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 15 * Neither the name of Intel Corporation nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 */ 30 31 #include "cache.h" 32 #undef __i686 33 34 #ifndef MEMCPY 35 # define MEMCPY memcpy 36 #endif 37 38 #ifndef L 39 # define L(label) .L##label 40 #endif 41 42 #ifndef cfi_startproc 43 # define cfi_startproc .cfi_startproc 44 #endif 45 46 #ifndef cfi_endproc 47 # define cfi_endproc .cfi_endproc 48 #endif 49 50 #ifndef cfi_rel_offset 51 # define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off 52 #endif 53 54 #ifndef cfi_restore 55 # define cfi_restore(reg) .cfi_restore reg 56 #endif 57 58 #ifndef cfi_adjust_cfa_offset 59 # define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off 60 #endif 61 62 #ifndef ENTRY 63 # define ENTRY(name) \ 64 .type name, @function; \ 65 .globl name; \ 66 .p2align 4; \ 67 name: \ 68 cfi_startproc 69 #endif 70 71 #ifndef END 72 # define END(name) \ 73 cfi_endproc; \ 74 .size name, .-name 75 #endif 76 77 #ifdef USE_AS_BCOPY 78 # define SRC PARMS 79 # define DEST SRC+4 80 # define LEN DEST+4 81 #else 82 # define DEST PARMS 83 # define SRC DEST+4 84 # define LEN SRC+4 85 #endif 86 87 #define CFI_PUSH(REG) \ 88 cfi_adjust_cfa_offset (4); \ 89 cfi_rel_offset (REG, 0) 90 91 #define CFI_POP(REG) \ 92 cfi_adjust_cfa_offset (-4); \ 93 cfi_restore (REG) 94 95 #define PUSH(REG) pushl REG; CFI_PUSH (REG) 96 #define POP(REG) popl REG; CFI_POP (REG) 97 98 #if (defined SHARED || defined __PIC__) 99 # define PARMS 8 /* Preserve EBX. */ 100 # define ENTRANCE PUSH (%ebx); 101 # define RETURN_END POP (%ebx); ret 102 # define RETURN RETURN_END; CFI_PUSH (%ebx) 103 # define JMPTBL(I, B) I - B 104 # undef __i686 105 106 # define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x 107 108 /* Load an entry in a jump table into EBX and branch to it. TABLE is a 109 jump table with relative offsets. INDEX is a register contains the 110 index into the jump table. SCALE is the scale of INDEX. */ 111 112 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 113 /* We first load PC into EBX. */ \ 114 SETUP_PIC_REG(bx); \ 115 /* Get the address of the jump table. 
*/ \ 116 addl $(TABLE - .), %ebx; \ 117 /* Get the entry and convert the relative offset to the \ 118 absolute address. */ \ 119 addl (%ebx, INDEX, SCALE), %ebx; \ 120 /* We loaded the jump table. Go. */ \ 121 jmp *%ebx 122 #else 123 124 # define PARMS 4 125 # define ENTRANCE 126 # define RETURN_END ret 127 # define RETURN RETURN_END 128 # define JMPTBL(I, B) I 129 130 /* Branch to an entry in a jump table. TABLE is a jump table with 131 absolute offsets. INDEX is a register contains the index into the 132 jump table. SCALE is the scale of INDEX. */ 133 134 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 135 jmp *TABLE(, INDEX, SCALE) 136 #endif 137 138 .section .text.ssse3,"ax",@progbits 139 ENTRY (MEMCPY) 140 ENTRANCE 141 movl LEN(%esp), %ecx 142 movl SRC(%esp), %eax 143 movl DEST(%esp), %edx 144 145 #ifdef USE_AS_MEMMOVE 146 cmp %eax, %edx 147 jb L(copy_forward) 148 je L(fwd_write_0bytes) 149 cmp $32, %ecx 150 jae L(memmove_bwd) 151 jmp L(bk_write_less32bytes_2) 152 153 .p2align 4 154 L(memmove_bwd): 155 add %ecx, %eax 156 cmp %eax, %edx 157 movl SRC(%esp), %eax 158 jb L(copy_backward) 159 160 L(copy_forward): 161 #endif 162 cmp $48, %ecx 163 jae L(48bytesormore) 164 165 L(fwd_write_less32bytes): 166 #ifndef USE_AS_MEMMOVE 167 cmp %dl, %al 168 jb L(bk_write) 169 #endif 170 add %ecx, %edx 171 add %ecx, %eax 172 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 173 #ifndef USE_AS_MEMMOVE 174 .p2align 4 175 L(bk_write): 176 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) 177 #endif 178 179 .p2align 4 180 L(48bytesormore): 181 #ifndef USE_AS_MEMMOVE 182 movlpd (%eax), %xmm0 183 movlpd 8(%eax), %xmm1 184 movlpd %xmm0, (%edx) 185 movlpd %xmm1, 8(%edx) 186 #else 187 movdqu (%eax), %xmm0 188 #endif 189 PUSH (%edi) 190 movl %edx, %edi 191 and $-16, %edx 192 add $16, %edx 193 sub %edx, %edi 194 add %edi, %ecx 195 sub %edi, %eax 196 197 #ifdef SHARED_CACHE_SIZE_HALF 198 cmp $SHARED_CACHE_SIZE_HALF, %ecx 199 #else 200 # if (defined SHARED || defined 
__PIC__) 201 SETUP_PIC_REG(bx) 202 add $_GLOBAL_OFFSET_TABLE_, %ebx 203 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx 204 # else 205 cmp __x86_shared_cache_size_half, %ecx 206 # endif 207 #endif 208 209 mov %eax, %edi 210 jae L(large_page) 211 and $0xf, %edi 212 jz L(shl_0) 213 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) 214 215 .p2align 4 216 L(shl_0): 217 #ifdef USE_AS_MEMMOVE 218 movl DEST+4(%esp), %edi 219 movdqu %xmm0, (%edi) 220 #endif 221 xor %edi, %edi 222 cmp $127, %ecx 223 ja L(shl_0_gobble) 224 lea -32(%ecx), %ecx 225 226 .p2align 4 227 L(shl_0_loop): 228 movdqa (%eax, %edi), %xmm0 229 movdqa 16(%eax, %edi), %xmm1 230 sub $32, %ecx 231 movdqa %xmm0, (%edx, %edi) 232 movdqa %xmm1, 16(%edx, %edi) 233 lea 32(%edi), %edi 234 jb L(shl_0_end) 235 236 movdqa (%eax, %edi), %xmm0 237 movdqa 16(%eax, %edi), %xmm1 238 sub $32, %ecx 239 movdqa %xmm0, (%edx, %edi) 240 movdqa %xmm1, 16(%edx, %edi) 241 lea 32(%edi), %edi 242 jb L(shl_0_end) 243 244 movdqa (%eax, %edi), %xmm0 245 movdqa 16(%eax, %edi), %xmm1 246 sub $32, %ecx 247 movdqa %xmm0, (%edx, %edi) 248 movdqa %xmm1, 16(%edx, %edi) 249 lea 32(%edi), %edi 250 jb L(shl_0_end) 251 252 movdqa (%eax, %edi), %xmm0 253 movdqa 16(%eax, %edi), %xmm1 254 sub $32, %ecx 255 movdqa %xmm0, (%edx, %edi) 256 movdqa %xmm1, 16(%edx, %edi) 257 lea 32(%edi), %edi 258 259 L(shl_0_end): 260 lea 32(%ecx), %ecx 261 add %ecx, %edi 262 add %edi, %edx 263 add %edi, %eax 264 POP (%edi) 265 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 266 267 CFI_PUSH (%edi) 268 269 .p2align 4 270 L(shl_0_gobble): 271 #ifdef DATA_CACHE_SIZE_HALF 272 cmp $DATA_CACHE_SIZE_HALF, %ecx 273 #else 274 # if (defined SHARED || defined __PIC__) 275 SETUP_PIC_REG(bx) 276 add $_GLOBAL_OFFSET_TABLE_, %ebx 277 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 278 # else 279 cmp __x86_data_cache_size_half, %ecx 280 # endif 281 #endif 282 POP (%edi) 283 lea -128(%ecx), %ecx 284 jae L(shl_0_gobble_mem_loop) 285 286 .p2align 4 287 
L(shl_0_gobble_cache_loop): 288 movdqa (%eax), %xmm0 289 movdqa 0x10(%eax), %xmm1 290 movdqa 0x20(%eax), %xmm2 291 movdqa 0x30(%eax), %xmm3 292 movdqa 0x40(%eax), %xmm4 293 movdqa 0x50(%eax), %xmm5 294 movdqa 0x60(%eax), %xmm6 295 movdqa 0x70(%eax), %xmm7 296 lea 0x80(%eax), %eax 297 sub $128, %ecx 298 movdqa %xmm0, (%edx) 299 movdqa %xmm1, 0x10(%edx) 300 movdqa %xmm2, 0x20(%edx) 301 movdqa %xmm3, 0x30(%edx) 302 movdqa %xmm4, 0x40(%edx) 303 movdqa %xmm5, 0x50(%edx) 304 movdqa %xmm6, 0x60(%edx) 305 movdqa %xmm7, 0x70(%edx) 306 lea 0x80(%edx), %edx 307 308 jae L(shl_0_gobble_cache_loop) 309 cmp $-0x40, %ecx 310 lea 0x80(%ecx), %ecx 311 jl L(shl_0_cache_less_64bytes) 312 313 movdqa (%eax), %xmm0 314 sub $0x40, %ecx 315 movdqa 0x10(%eax), %xmm1 316 movdqa %xmm0, (%edx) 317 movdqa %xmm1, 0x10(%edx) 318 movdqa 0x20(%eax), %xmm0 319 movdqa 0x30(%eax), %xmm1 320 add $0x40, %eax 321 movdqa %xmm0, 0x20(%edx) 322 movdqa %xmm1, 0x30(%edx) 323 add $0x40, %edx 324 325 L(shl_0_cache_less_64bytes): 326 cmp $0x20, %ecx 327 jb L(shl_0_cache_less_32bytes) 328 movdqa (%eax), %xmm0 329 sub $0x20, %ecx 330 movdqa 0x10(%eax), %xmm1 331 add $0x20, %eax 332 movdqa %xmm0, (%edx) 333 movdqa %xmm1, 0x10(%edx) 334 add $0x20, %edx 335 336 L(shl_0_cache_less_32bytes): 337 cmp $0x10, %ecx 338 jb L(shl_0_cache_less_16bytes) 339 sub $0x10, %ecx 340 movdqa (%eax), %xmm0 341 add $0x10, %eax 342 movdqa %xmm0, (%edx) 343 add $0x10, %edx 344 345 L(shl_0_cache_less_16bytes): 346 add %ecx, %edx 347 add %ecx, %eax 348 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 349 350 .p2align 4 351 L(shl_0_gobble_mem_loop): 352 prefetcht0 0x1c0(%eax) 353 prefetcht0 0x280(%eax) 354 prefetcht0 0x1c0(%edx) 355 356 movdqa (%eax), %xmm0 357 movdqa 0x10(%eax), %xmm1 358 movdqa 0x20(%eax), %xmm2 359 movdqa 0x30(%eax), %xmm3 360 movdqa 0x40(%eax), %xmm4 361 movdqa 0x50(%eax), %xmm5 362 movdqa 0x60(%eax), %xmm6 363 movdqa 0x70(%eax), %xmm7 364 lea 0x80(%eax), %eax 365 sub $0x80, %ecx 366 movdqa %xmm0, (%edx) 367 movdqa 
%xmm1, 0x10(%edx) 368 movdqa %xmm2, 0x20(%edx) 369 movdqa %xmm3, 0x30(%edx) 370 movdqa %xmm4, 0x40(%edx) 371 movdqa %xmm5, 0x50(%edx) 372 movdqa %xmm6, 0x60(%edx) 373 movdqa %xmm7, 0x70(%edx) 374 lea 0x80(%edx), %edx 375 376 jae L(shl_0_gobble_mem_loop) 377 cmp $-0x40, %ecx 378 lea 0x80(%ecx), %ecx 379 jl L(shl_0_mem_less_64bytes) 380 381 movdqa (%eax), %xmm0 382 sub $0x40, %ecx 383 movdqa 0x10(%eax), %xmm1 384 385 movdqa %xmm0, (%edx) 386 movdqa %xmm1, 0x10(%edx) 387 388 movdqa 0x20(%eax), %xmm0 389 movdqa 0x30(%eax), %xmm1 390 add $0x40, %eax 391 392 movdqa %xmm0, 0x20(%edx) 393 movdqa %xmm1, 0x30(%edx) 394 add $0x40, %edx 395 396 L(shl_0_mem_less_64bytes): 397 cmp $0x20, %ecx 398 jb L(shl_0_mem_less_32bytes) 399 movdqa (%eax), %xmm0 400 sub $0x20, %ecx 401 movdqa 0x10(%eax), %xmm1 402 add $0x20, %eax 403 movdqa %xmm0, (%edx) 404 movdqa %xmm1, 0x10(%edx) 405 add $0x20, %edx 406 407 L(shl_0_mem_less_32bytes): 408 cmp $0x10, %ecx 409 jb L(shl_0_mem_less_16bytes) 410 sub $0x10, %ecx 411 movdqa (%eax), %xmm0 412 add $0x10, %eax 413 movdqa %xmm0, (%edx) 414 add $0x10, %edx 415 416 L(shl_0_mem_less_16bytes): 417 add %ecx, %edx 418 add %ecx, %eax 419 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 420 421 .p2align 4 422 L(shl_1): 423 #ifndef USE_AS_MEMMOVE 424 movaps -1(%eax), %xmm1 425 #else 426 movl DEST+4(%esp), %edi 427 movaps -1(%eax), %xmm1 428 movdqu %xmm0, (%edi) 429 #endif 430 #ifdef DATA_CACHE_SIZE_HALF 431 cmp $DATA_CACHE_SIZE_HALF, %ecx 432 #else 433 # if (defined SHARED || defined __PIC__) 434 SETUP_PIC_REG(bx) 435 add $_GLOBAL_OFFSET_TABLE_, %ebx 436 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 437 # else 438 cmp __x86_data_cache_size_half, %ecx 439 # endif 440 #endif 441 jb L(sh_1_no_prefetch) 442 443 lea -64(%ecx), %ecx 444 445 .p2align 4 446 L(Shl1LoopStart): 447 prefetcht0 0x1c0(%eax) 448 prefetcht0 0x1c0(%edx) 449 movaps 15(%eax), %xmm2 450 movaps 31(%eax), %xmm3 451 movaps 47(%eax), %xmm4 452 movaps 63(%eax), %xmm5 453 movaps %xmm5, 
%xmm7 454 palignr $1, %xmm4, %xmm5 455 palignr $1, %xmm3, %xmm4 456 movaps %xmm5, 48(%edx) 457 palignr $1, %xmm2, %xmm3 458 lea 64(%eax), %eax 459 palignr $1, %xmm1, %xmm2 460 movaps %xmm4, 32(%edx) 461 movaps %xmm3, 16(%edx) 462 movaps %xmm7, %xmm1 463 movaps %xmm2, (%edx) 464 lea 64(%edx), %edx 465 sub $64, %ecx 466 ja L(Shl1LoopStart) 467 468 L(Shl1LoopLeave): 469 add $32, %ecx 470 jle L(shl_end_0) 471 472 movaps 15(%eax), %xmm2 473 movaps 31(%eax), %xmm3 474 palignr $1, %xmm2, %xmm3 475 palignr $1, %xmm1, %xmm2 476 movaps %xmm2, (%edx) 477 movaps %xmm3, 16(%edx) 478 lea 32(%edx, %ecx), %edx 479 lea 32(%eax, %ecx), %eax 480 POP (%edi) 481 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 482 483 CFI_PUSH (%edi) 484 485 .p2align 4 486 L(sh_1_no_prefetch): 487 lea -32(%ecx), %ecx 488 lea -1(%eax), %eax 489 xor %edi, %edi 490 491 .p2align 4 492 L(sh_1_no_prefetch_loop): 493 movdqa 16(%eax, %edi), %xmm2 494 sub $32, %ecx 495 movdqa 32(%eax, %edi), %xmm3 496 movdqa %xmm3, %xmm4 497 palignr $1, %xmm2, %xmm3 498 palignr $1, %xmm1, %xmm2 499 lea 32(%edi), %edi 500 movdqa %xmm2, -32(%edx, %edi) 501 movdqa %xmm3, -16(%edx, %edi) 502 jb L(sh_1_end_no_prefetch_loop) 503 504 movdqa 16(%eax, %edi), %xmm2 505 sub $32, %ecx 506 movdqa 32(%eax, %edi), %xmm3 507 movdqa %xmm3, %xmm1 508 palignr $1, %xmm2, %xmm3 509 palignr $1, %xmm4, %xmm2 510 lea 32(%edi), %edi 511 movdqa %xmm2, -32(%edx, %edi) 512 movdqa %xmm3, -16(%edx, %edi) 513 jae L(sh_1_no_prefetch_loop) 514 515 L(sh_1_end_no_prefetch_loop): 516 lea 32(%ecx), %ecx 517 add %ecx, %edi 518 add %edi, %edx 519 lea 1(%edi, %eax), %eax 520 POP (%edi) 521 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 522 523 CFI_PUSH (%edi) 524 525 .p2align 4 526 L(shl_2): 527 #ifndef USE_AS_MEMMOVE 528 movaps -2(%eax), %xmm1 529 #else 530 movl DEST+4(%esp), %edi 531 movaps -2(%eax), %xmm1 532 movdqu %xmm0, (%edi) 533 #endif 534 #ifdef DATA_CACHE_SIZE_HALF 535 cmp $DATA_CACHE_SIZE_HALF, %ecx 536 #else 537 # if (defined SHARED || defined 
__PIC__) 538 SETUP_PIC_REG(bx) 539 add $_GLOBAL_OFFSET_TABLE_, %ebx 540 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 541 # else 542 cmp __x86_data_cache_size_half, %ecx 543 # endif 544 #endif 545 jb L(sh_2_no_prefetch) 546 547 lea -64(%ecx), %ecx 548 549 .p2align 4 550 L(Shl2LoopStart): 551 prefetcht0 0x1c0(%eax) 552 prefetcht0 0x1c0(%edx) 553 movaps 14(%eax), %xmm2 554 movaps 30(%eax), %xmm3 555 movaps 46(%eax), %xmm4 556 movaps 62(%eax), %xmm5 557 movaps %xmm5, %xmm7 558 palignr $2, %xmm4, %xmm5 559 palignr $2, %xmm3, %xmm4 560 movaps %xmm5, 48(%edx) 561 palignr $2, %xmm2, %xmm3 562 lea 64(%eax), %eax 563 palignr $2, %xmm1, %xmm2 564 movaps %xmm4, 32(%edx) 565 movaps %xmm3, 16(%edx) 566 movaps %xmm7, %xmm1 567 movaps %xmm2, (%edx) 568 lea 64(%edx), %edx 569 sub $64, %ecx 570 ja L(Shl2LoopStart) 571 572 L(Shl2LoopLeave): 573 add $32, %ecx 574 jle L(shl_end_0) 575 576 movaps 14(%eax), %xmm2 577 movaps 30(%eax), %xmm3 578 palignr $2, %xmm2, %xmm3 579 palignr $2, %xmm1, %xmm2 580 movaps %xmm2, (%edx) 581 movaps %xmm3, 16(%edx) 582 lea 32(%edx, %ecx), %edx 583 lea 32(%eax, %ecx), %eax 584 POP (%edi) 585 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 586 587 CFI_PUSH (%edi) 588 589 .p2align 4 590 L(sh_2_no_prefetch): 591 lea -32(%ecx), %ecx 592 lea -2(%eax), %eax 593 xor %edi, %edi 594 595 .p2align 4 596 L(sh_2_no_prefetch_loop): 597 movdqa 16(%eax, %edi), %xmm2 598 sub $32, %ecx 599 movdqa 32(%eax, %edi), %xmm3 600 movdqa %xmm3, %xmm4 601 palignr $2, %xmm2, %xmm3 602 palignr $2, %xmm1, %xmm2 603 lea 32(%edi), %edi 604 movdqa %xmm2, -32(%edx, %edi) 605 movdqa %xmm3, -16(%edx, %edi) 606 jb L(sh_2_end_no_prefetch_loop) 607 608 movdqa 16(%eax, %edi), %xmm2 609 sub $32, %ecx 610 movdqa 32(%eax, %edi), %xmm3 611 movdqa %xmm3, %xmm1 612 palignr $2, %xmm2, %xmm3 613 palignr $2, %xmm4, %xmm2 614 lea 32(%edi), %edi 615 movdqa %xmm2, -32(%edx, %edi) 616 movdqa %xmm3, -16(%edx, %edi) 617 jae L(sh_2_no_prefetch_loop) 618 619 L(sh_2_end_no_prefetch_loop): 620 lea 
32(%ecx), %ecx 621 add %ecx, %edi 622 add %edi, %edx 623 lea 2(%edi, %eax), %eax 624 POP (%edi) 625 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 626 627 CFI_PUSH (%edi) 628 629 .p2align 4 630 L(shl_3): 631 #ifndef USE_AS_MEMMOVE 632 movaps -3(%eax), %xmm1 633 #else 634 movl DEST+4(%esp), %edi 635 movaps -3(%eax), %xmm1 636 movdqu %xmm0, (%edi) 637 #endif 638 #ifdef DATA_CACHE_SIZE_HALF 639 cmp $DATA_CACHE_SIZE_HALF, %ecx 640 #else 641 # if (defined SHARED || defined __PIC__) 642 SETUP_PIC_REG(bx) 643 add $_GLOBAL_OFFSET_TABLE_, %ebx 644 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 645 # else 646 cmp __x86_data_cache_size_half, %ecx 647 # endif 648 #endif 649 jb L(sh_3_no_prefetch) 650 651 lea -64(%ecx), %ecx 652 653 .p2align 4 654 L(Shl3LoopStart): 655 prefetcht0 0x1c0(%eax) 656 prefetcht0 0x1c0(%edx) 657 movaps 13(%eax), %xmm2 658 movaps 29(%eax), %xmm3 659 movaps 45(%eax), %xmm4 660 movaps 61(%eax), %xmm5 661 movaps %xmm5, %xmm7 662 palignr $3, %xmm4, %xmm5 663 palignr $3, %xmm3, %xmm4 664 movaps %xmm5, 48(%edx) 665 palignr $3, %xmm2, %xmm3 666 lea 64(%eax), %eax 667 palignr $3, %xmm1, %xmm2 668 movaps %xmm4, 32(%edx) 669 movaps %xmm3, 16(%edx) 670 movaps %xmm7, %xmm1 671 movaps %xmm2, (%edx) 672 lea 64(%edx), %edx 673 sub $64, %ecx 674 ja L(Shl3LoopStart) 675 676 L(Shl3LoopLeave): 677 add $32, %ecx 678 jle L(shl_end_0) 679 680 movaps 13(%eax), %xmm2 681 movaps 29(%eax), %xmm3 682 palignr $3, %xmm2, %xmm3 683 palignr $3, %xmm1, %xmm2 684 movaps %xmm2, (%edx) 685 movaps %xmm3, 16(%edx) 686 lea 32(%edx, %ecx), %edx 687 lea 32(%eax, %ecx), %eax 688 POP (%edi) 689 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 690 691 CFI_PUSH (%edi) 692 693 .p2align 4 694 L(sh_3_no_prefetch): 695 lea -32(%ecx), %ecx 696 lea -3(%eax), %eax 697 xor %edi, %edi 698 699 .p2align 4 700 L(sh_3_no_prefetch_loop): 701 movdqa 16(%eax, %edi), %xmm2 702 sub $32, %ecx 703 movdqa 32(%eax, %edi), %xmm3 704 movdqa %xmm3, %xmm4 705 palignr $3, %xmm2, %xmm3 706 palignr $3, %xmm1, 
%xmm2 707 lea 32(%edi), %edi 708 movdqa %xmm2, -32(%edx, %edi) 709 movdqa %xmm3, -16(%edx, %edi) 710 711 jb L(sh_3_end_no_prefetch_loop) 712 713 movdqa 16(%eax, %edi), %xmm2 714 sub $32, %ecx 715 movdqa 32(%eax, %edi), %xmm3 716 movdqa %xmm3, %xmm1 717 palignr $3, %xmm2, %xmm3 718 palignr $3, %xmm4, %xmm2 719 lea 32(%edi), %edi 720 movdqa %xmm2, -32(%edx, %edi) 721 movdqa %xmm3, -16(%edx, %edi) 722 723 jae L(sh_3_no_prefetch_loop) 724 725 L(sh_3_end_no_prefetch_loop): 726 lea 32(%ecx), %ecx 727 add %ecx, %edi 728 add %edi, %edx 729 lea 3(%edi, %eax), %eax 730 POP (%edi) 731 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 732 733 CFI_PUSH (%edi) 734 735 .p2align 4 736 L(shl_4): 737 #ifndef USE_AS_MEMMOVE 738 movaps -4(%eax), %xmm1 739 #else 740 movl DEST+4(%esp), %edi 741 movaps -4(%eax), %xmm1 742 movdqu %xmm0, (%edi) 743 #endif 744 #ifdef DATA_CACHE_SIZE_HALF 745 cmp $DATA_CACHE_SIZE_HALF, %ecx 746 #else 747 # if (defined SHARED || defined __PIC__) 748 SETUP_PIC_REG(bx) 749 add $_GLOBAL_OFFSET_TABLE_, %ebx 750 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 751 # else 752 cmp __x86_data_cache_size_half, %ecx 753 # endif 754 #endif 755 jb L(sh_4_no_prefetch) 756 757 lea -64(%ecx), %ecx 758 759 .p2align 4 760 L(Shl4LoopStart): 761 prefetcht0 0x1c0(%eax) 762 prefetcht0 0x1c0(%edx) 763 movaps 12(%eax), %xmm2 764 movaps 28(%eax), %xmm3 765 movaps 44(%eax), %xmm4 766 movaps 60(%eax), %xmm5 767 movaps %xmm5, %xmm7 768 palignr $4, %xmm4, %xmm5 769 palignr $4, %xmm3, %xmm4 770 movaps %xmm5, 48(%edx) 771 palignr $4, %xmm2, %xmm3 772 lea 64(%eax), %eax 773 palignr $4, %xmm1, %xmm2 774 movaps %xmm4, 32(%edx) 775 movaps %xmm3, 16(%edx) 776 movaps %xmm7, %xmm1 777 movaps %xmm2, (%edx) 778 lea 64(%edx), %edx 779 sub $64, %ecx 780 ja L(Shl4LoopStart) 781 782 L(Shl4LoopLeave): 783 add $32, %ecx 784 jle L(shl_end_0) 785 786 movaps 12(%eax), %xmm2 787 movaps 28(%eax), %xmm3 788 palignr $4, %xmm2, %xmm3 789 palignr $4, %xmm1, %xmm2 790 movaps %xmm2, (%edx) 791 movaps %xmm3, 
16(%edx) 792 lea 32(%edx, %ecx), %edx 793 lea 32(%eax, %ecx), %eax 794 POP (%edi) 795 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 796 797 CFI_PUSH (%edi) 798 799 .p2align 4 800 L(sh_4_no_prefetch): 801 lea -32(%ecx), %ecx 802 lea -4(%eax), %eax 803 xor %edi, %edi 804 805 .p2align 4 806 L(sh_4_no_prefetch_loop): 807 movdqa 16(%eax, %edi), %xmm2 808 sub $32, %ecx 809 movdqa 32(%eax, %edi), %xmm3 810 movdqa %xmm3, %xmm4 811 palignr $4, %xmm2, %xmm3 812 palignr $4, %xmm1, %xmm2 813 lea 32(%edi), %edi 814 movdqa %xmm2, -32(%edx, %edi) 815 movdqa %xmm3, -16(%edx, %edi) 816 817 jb L(sh_4_end_no_prefetch_loop) 818 819 movdqa 16(%eax, %edi), %xmm2 820 sub $32, %ecx 821 movdqa 32(%eax, %edi), %xmm3 822 movdqa %xmm3, %xmm1 823 palignr $4, %xmm2, %xmm3 824 palignr $4, %xmm4, %xmm2 825 lea 32(%edi), %edi 826 movdqa %xmm2, -32(%edx, %edi) 827 movdqa %xmm3, -16(%edx, %edi) 828 829 jae L(sh_4_no_prefetch_loop) 830 831 L(sh_4_end_no_prefetch_loop): 832 lea 32(%ecx), %ecx 833 add %ecx, %edi 834 add %edi, %edx 835 lea 4(%edi, %eax), %eax 836 POP (%edi) 837 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 838 839 CFI_PUSH (%edi) 840 841 .p2align 4 842 L(shl_5): 843 #ifndef USE_AS_MEMMOVE 844 movaps -5(%eax), %xmm1 845 #else 846 movl DEST+4(%esp), %edi 847 movaps -5(%eax), %xmm1 848 movdqu %xmm0, (%edi) 849 #endif 850 #ifdef DATA_CACHE_SIZE_HALF 851 cmp $DATA_CACHE_SIZE_HALF, %ecx 852 #else 853 # if (defined SHARED || defined __PIC__) 854 SETUP_PIC_REG(bx) 855 add $_GLOBAL_OFFSET_TABLE_, %ebx 856 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 857 # else 858 cmp __x86_data_cache_size_half, %ecx 859 # endif 860 #endif 861 jb L(sh_5_no_prefetch) 862 863 lea -64(%ecx), %ecx 864 865 .p2align 4 866 L(Shl5LoopStart): 867 prefetcht0 0x1c0(%eax) 868 prefetcht0 0x1c0(%edx) 869 movaps 11(%eax), %xmm2 870 movaps 27(%eax), %xmm3 871 movaps 43(%eax), %xmm4 872 movaps 59(%eax), %xmm5 873 movaps %xmm5, %xmm7 874 palignr $5, %xmm4, %xmm5 875 palignr $5, %xmm3, %xmm4 876 movaps %xmm5, 
48(%edx) 877 palignr $5, %xmm2, %xmm3 878 lea 64(%eax), %eax 879 palignr $5, %xmm1, %xmm2 880 movaps %xmm4, 32(%edx) 881 movaps %xmm3, 16(%edx) 882 movaps %xmm7, %xmm1 883 movaps %xmm2, (%edx) 884 lea 64(%edx), %edx 885 sub $64, %ecx 886 ja L(Shl5LoopStart) 887 888 L(Shl5LoopLeave): 889 add $32, %ecx 890 jle L(shl_end_0) 891 892 movaps 11(%eax), %xmm2 893 movaps 27(%eax), %xmm3 894 palignr $5, %xmm2, %xmm3 895 palignr $5, %xmm1, %xmm2 896 movaps %xmm2, (%edx) 897 movaps %xmm3, 16(%edx) 898 lea 32(%edx, %ecx), %edx 899 lea 32(%eax, %ecx), %eax 900 POP (%edi) 901 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 902 903 CFI_PUSH (%edi) 904 905 .p2align 4 906 L(sh_5_no_prefetch): 907 lea -32(%ecx), %ecx 908 lea -5(%eax), %eax 909 xor %edi, %edi 910 911 .p2align 4 912 L(sh_5_no_prefetch_loop): 913 movdqa 16(%eax, %edi), %xmm2 914 sub $32, %ecx 915 movdqa 32(%eax, %edi), %xmm3 916 movdqa %xmm3, %xmm4 917 palignr $5, %xmm2, %xmm3 918 palignr $5, %xmm1, %xmm2 919 lea 32(%edi), %edi 920 movdqa %xmm2, -32(%edx, %edi) 921 movdqa %xmm3, -16(%edx, %edi) 922 923 jb L(sh_5_end_no_prefetch_loop) 924 925 movdqa 16(%eax, %edi), %xmm2 926 sub $32, %ecx 927 movdqa 32(%eax, %edi), %xmm3 928 movdqa %xmm3, %xmm1 929 palignr $5, %xmm2, %xmm3 930 palignr $5, %xmm4, %xmm2 931 lea 32(%edi), %edi 932 movdqa %xmm2, -32(%edx, %edi) 933 movdqa %xmm3, -16(%edx, %edi) 934 935 jae L(sh_5_no_prefetch_loop) 936 937 L(sh_5_end_no_prefetch_loop): 938 lea 32(%ecx), %ecx 939 add %ecx, %edi 940 add %edi, %edx 941 lea 5(%edi, %eax), %eax 942 POP (%edi) 943 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 944 945 CFI_PUSH (%edi) 946 947 .p2align 4 948 L(shl_6): 949 #ifndef USE_AS_MEMMOVE 950 movaps -6(%eax), %xmm1 951 #else 952 movl DEST+4(%esp), %edi 953 movaps -6(%eax), %xmm1 954 movdqu %xmm0, (%edi) 955 #endif 956 #ifdef DATA_CACHE_SIZE_HALF 957 cmp $DATA_CACHE_SIZE_HALF, %ecx 958 #else 959 # if (defined SHARED || defined __PIC__) 960 SETUP_PIC_REG(bx) 961 add $_GLOBAL_OFFSET_TABLE_, %ebx 962 
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 963 # else 964 cmp __x86_data_cache_size_half, %ecx 965 # endif 966 #endif 967 jb L(sh_6_no_prefetch) 968 969 lea -64(%ecx), %ecx 970 971 .p2align 4 972 L(Shl6LoopStart): 973 prefetcht0 0x1c0(%eax) 974 prefetcht0 0x1c0(%edx) 975 movaps 10(%eax), %xmm2 976 movaps 26(%eax), %xmm3 977 movaps 42(%eax), %xmm4 978 movaps 58(%eax), %xmm5 979 movaps %xmm5, %xmm7 980 palignr $6, %xmm4, %xmm5 981 palignr $6, %xmm3, %xmm4 982 movaps %xmm5, 48(%edx) 983 palignr $6, %xmm2, %xmm3 984 lea 64(%eax), %eax 985 palignr $6, %xmm1, %xmm2 986 movaps %xmm4, 32(%edx) 987 movaps %xmm3, 16(%edx) 988 movaps %xmm7, %xmm1 989 movaps %xmm2, (%edx) 990 lea 64(%edx), %edx 991 sub $64, %ecx 992 ja L(Shl6LoopStart) 993 994 L(Shl6LoopLeave): 995 add $32, %ecx 996 jle L(shl_end_0) 997 998 movaps 10(%eax), %xmm2 999 movaps 26(%eax), %xmm3 1000 palignr $6, %xmm2, %xmm3 1001 palignr $6, %xmm1, %xmm2 1002 movaps %xmm2, (%edx) 1003 movaps %xmm3, 16(%edx) 1004 lea 32(%edx, %ecx), %edx 1005 lea 32(%eax, %ecx), %eax 1006 POP (%edi) 1007 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1008 1009 CFI_PUSH (%edi) 1010 1011 .p2align 4 1012 L(sh_6_no_prefetch): 1013 lea -32(%ecx), %ecx 1014 lea -6(%eax), %eax 1015 xor %edi, %edi 1016 1017 .p2align 4 1018 L(sh_6_no_prefetch_loop): 1019 movdqa 16(%eax, %edi), %xmm2 1020 sub $32, %ecx 1021 movdqa 32(%eax, %edi), %xmm3 1022 movdqa %xmm3, %xmm4 1023 palignr $6, %xmm2, %xmm3 1024 palignr $6, %xmm1, %xmm2 1025 lea 32(%edi), %edi 1026 movdqa %xmm2, -32(%edx, %edi) 1027 movdqa %xmm3, -16(%edx, %edi) 1028 1029 jb L(sh_6_end_no_prefetch_loop) 1030 1031 movdqa 16(%eax, %edi), %xmm2 1032 sub $32, %ecx 1033 movdqa 32(%eax, %edi), %xmm3 1034 movdqa %xmm3, %xmm1 1035 palignr $6, %xmm2, %xmm3 1036 palignr $6, %xmm4, %xmm2 1037 lea 32(%edi), %edi 1038 movdqa %xmm2, -32(%edx, %edi) 1039 movdqa %xmm3, -16(%edx, %edi) 1040 1041 jae L(sh_6_no_prefetch_loop) 1042 1043 L(sh_6_end_no_prefetch_loop): 1044 lea 32(%ecx), %ecx 1045 add 
%ecx, %edi 1046 add %edi, %edx 1047 lea 6(%edi, %eax), %eax 1048 POP (%edi) 1049 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1050 1051 CFI_PUSH (%edi) 1052 1053 .p2align 4 1054 L(shl_7): 1055 #ifndef USE_AS_MEMMOVE 1056 movaps -7(%eax), %xmm1 1057 #else 1058 movl DEST+4(%esp), %edi 1059 movaps -7(%eax), %xmm1 1060 movdqu %xmm0, (%edi) 1061 #endif 1062 #ifdef DATA_CACHE_SIZE_HALF 1063 cmp $DATA_CACHE_SIZE_HALF, %ecx 1064 #else 1065 # if (defined SHARED || defined __PIC__) 1066 SETUP_PIC_REG(bx) 1067 add $_GLOBAL_OFFSET_TABLE_, %ebx 1068 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1069 # else 1070 cmp __x86_data_cache_size_half, %ecx 1071 # endif 1072 #endif 1073 jb L(sh_7_no_prefetch) 1074 1075 lea -64(%ecx), %ecx 1076 1077 .p2align 4 1078 L(Shl7LoopStart): 1079 prefetcht0 0x1c0(%eax) 1080 prefetcht0 0x1c0(%edx) 1081 movaps 9(%eax), %xmm2 1082 movaps 25(%eax), %xmm3 1083 movaps 41(%eax), %xmm4 1084 movaps 57(%eax), %xmm5 1085 movaps %xmm5, %xmm7 1086 palignr $7, %xmm4, %xmm5 1087 palignr $7, %xmm3, %xmm4 1088 movaps %xmm5, 48(%edx) 1089 palignr $7, %xmm2, %xmm3 1090 lea 64(%eax), %eax 1091 palignr $7, %xmm1, %xmm2 1092 movaps %xmm4, 32(%edx) 1093 movaps %xmm3, 16(%edx) 1094 movaps %xmm7, %xmm1 1095 movaps %xmm2, (%edx) 1096 lea 64(%edx), %edx 1097 sub $64, %ecx 1098 ja L(Shl7LoopStart) 1099 1100 L(Shl7LoopLeave): 1101 add $32, %ecx 1102 jle L(shl_end_0) 1103 1104 movaps 9(%eax), %xmm2 1105 movaps 25(%eax), %xmm3 1106 palignr $7, %xmm2, %xmm3 1107 palignr $7, %xmm1, %xmm2 1108 movaps %xmm2, (%edx) 1109 movaps %xmm3, 16(%edx) 1110 lea 32(%edx, %ecx), %edx 1111 lea 32(%eax, %ecx), %eax 1112 POP (%edi) 1113 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1114 1115 CFI_PUSH (%edi) 1116 1117 .p2align 4 1118 L(sh_7_no_prefetch): 1119 lea -32(%ecx), %ecx 1120 lea -7(%eax), %eax 1121 xor %edi, %edi 1122 1123 .p2align 4 1124 L(sh_7_no_prefetch_loop): 1125 movdqa 16(%eax, %edi), %xmm2 1126 sub $32, %ecx 1127 movdqa 32(%eax, %edi), %xmm3 1128 movdqa %xmm3, 
%xmm4 1129 palignr $7, %xmm2, %xmm3 1130 palignr $7, %xmm1, %xmm2 1131 lea 32(%edi), %edi 1132 movdqa %xmm2, -32(%edx, %edi) 1133 movdqa %xmm3, -16(%edx, %edi) 1134 jb L(sh_7_end_no_prefetch_loop) 1135 1136 movdqa 16(%eax, %edi), %xmm2 1137 sub $32, %ecx 1138 movdqa 32(%eax, %edi), %xmm3 1139 movdqa %xmm3, %xmm1 1140 palignr $7, %xmm2, %xmm3 1141 palignr $7, %xmm4, %xmm2 1142 lea 32(%edi), %edi 1143 movdqa %xmm2, -32(%edx, %edi) 1144 movdqa %xmm3, -16(%edx, %edi) 1145 jae L(sh_7_no_prefetch_loop) 1146 1147 L(sh_7_end_no_prefetch_loop): 1148 lea 32(%ecx), %ecx 1149 add %ecx, %edi 1150 add %edi, %edx 1151 lea 7(%edi, %eax), %eax 1152 POP (%edi) 1153 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1154 1155 CFI_PUSH (%edi) 1156 1157 .p2align 4 1158 L(shl_8): 1159 #ifndef USE_AS_MEMMOVE 1160 movaps -8(%eax), %xmm1 1161 #else 1162 movl DEST+4(%esp), %edi 1163 movaps -8(%eax), %xmm1 1164 movdqu %xmm0, (%edi) 1165 #endif 1166 #ifdef DATA_CACHE_SIZE_HALF 1167 cmp $DATA_CACHE_SIZE_HALF, %ecx 1168 #else 1169 # if (defined SHARED || defined __PIC__) 1170 SETUP_PIC_REG(bx) 1171 add $_GLOBAL_OFFSET_TABLE_, %ebx 1172 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1173 # else 1174 cmp __x86_data_cache_size_half, %ecx 1175 # endif 1176 #endif 1177 jb L(sh_8_no_prefetch) 1178 1179 lea -64(%ecx), %ecx 1180 1181 .p2align 4 1182 L(Shl8LoopStart): 1183 prefetcht0 0x1c0(%eax) 1184 prefetcht0 0x1c0(%edx) 1185 movaps 8(%eax), %xmm2 1186 movaps 24(%eax), %xmm3 1187 movaps 40(%eax), %xmm4 1188 movaps 56(%eax), %xmm5 1189 movaps %xmm5, %xmm7 1190 palignr $8, %xmm4, %xmm5 1191 palignr $8, %xmm3, %xmm4 1192 movaps %xmm5, 48(%edx) 1193 palignr $8, %xmm2, %xmm3 1194 lea 64(%eax), %eax 1195 palignr $8, %xmm1, %xmm2 1196 movaps %xmm4, 32(%edx) 1197 movaps %xmm3, 16(%edx) 1198 movaps %xmm7, %xmm1 1199 movaps %xmm2, (%edx) 1200 lea 64(%edx), %edx 1201 sub $64, %ecx 1202 ja L(Shl8LoopStart) 1203 1204 L(LoopLeave8): 1205 add $32, %ecx 1206 jle L(shl_end_0) 1207 1208 movaps 8(%eax), %xmm2 1209 
movaps 24(%eax), %xmm3 1210 palignr $8, %xmm2, %xmm3 1211 palignr $8, %xmm1, %xmm2 1212 movaps %xmm2, (%edx) 1213 movaps %xmm3, 16(%edx) 1214 lea 32(%edx, %ecx), %edx 1215 lea 32(%eax, %ecx), %eax 1216 POP (%edi) 1217 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1218 1219 CFI_PUSH (%edi) 1220 1221 .p2align 4 1222 L(sh_8_no_prefetch): 1223 lea -32(%ecx), %ecx 1224 lea -8(%eax), %eax 1225 xor %edi, %edi 1226 1227 .p2align 4 1228 L(sh_8_no_prefetch_loop): 1229 movdqa 16(%eax, %edi), %xmm2 1230 sub $32, %ecx 1231 movdqa 32(%eax, %edi), %xmm3 1232 movdqa %xmm3, %xmm4 1233 palignr $8, %xmm2, %xmm3 1234 palignr $8, %xmm1, %xmm2 1235 lea 32(%edi), %edi 1236 movdqa %xmm2, -32(%edx, %edi) 1237 movdqa %xmm3, -16(%edx, %edi) 1238 jb L(sh_8_end_no_prefetch_loop) 1239 1240 movdqa 16(%eax, %edi), %xmm2 1241 sub $32, %ecx 1242 movdqa 32(%eax, %edi), %xmm3 1243 movdqa %xmm3, %xmm1 1244 palignr $8, %xmm2, %xmm3 1245 palignr $8, %xmm4, %xmm2 1246 lea 32(%edi), %edi 1247 movdqa %xmm2, -32(%edx, %edi) 1248 movdqa %xmm3, -16(%edx, %edi) 1249 jae L(sh_8_no_prefetch_loop) 1250 1251 L(sh_8_end_no_prefetch_loop): 1252 lea 32(%ecx), %ecx 1253 add %ecx, %edi 1254 add %edi, %edx 1255 lea 8(%edi, %eax), %eax 1256 POP (%edi) 1257 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1258 1259 CFI_PUSH (%edi) 1260 1261 .p2align 4 1262 L(shl_9): 1263 #ifndef USE_AS_MEMMOVE 1264 movaps -9(%eax), %xmm1 1265 #else 1266 movl DEST+4(%esp), %edi 1267 movaps -9(%eax), %xmm1 1268 movdqu %xmm0, (%edi) 1269 #endif 1270 #ifdef DATA_CACHE_SIZE_HALF 1271 cmp $DATA_CACHE_SIZE_HALF, %ecx 1272 #else 1273 # if (defined SHARED || defined __PIC__) 1274 SETUP_PIC_REG(bx) 1275 add $_GLOBAL_OFFSET_TABLE_, %ebx 1276 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1277 # else 1278 cmp __x86_data_cache_size_half, %ecx 1279 # endif 1280 #endif 1281 jb L(sh_9_no_prefetch) 1282 1283 lea -64(%ecx), %ecx 1284 1285 .p2align 4 1286 L(Shl9LoopStart): 1287 prefetcht0 0x1c0(%eax) 1288 prefetcht0 0x1c0(%edx) 1289 movaps 
7(%eax), %xmm2 1290 movaps 23(%eax), %xmm3 1291 movaps 39(%eax), %xmm4 1292 movaps 55(%eax), %xmm5 1293 movaps %xmm5, %xmm7 1294 palignr $9, %xmm4, %xmm5 1295 palignr $9, %xmm3, %xmm4 1296 movaps %xmm5, 48(%edx) 1297 palignr $9, %xmm2, %xmm3 1298 lea 64(%eax), %eax 1299 palignr $9, %xmm1, %xmm2 1300 movaps %xmm4, 32(%edx) 1301 movaps %xmm3, 16(%edx) 1302 movaps %xmm7, %xmm1 1303 movaps %xmm2, (%edx) 1304 lea 64(%edx), %edx 1305 sub $64, %ecx 1306 ja L(Shl9LoopStart) 1307 1308 L(Shl9LoopLeave): 1309 add $32, %ecx 1310 jle L(shl_end_0) 1311 1312 movaps 7(%eax), %xmm2 1313 movaps 23(%eax), %xmm3 1314 palignr $9, %xmm2, %xmm3 1315 palignr $9, %xmm1, %xmm2 1316 1317 movaps %xmm2, (%edx) 1318 movaps %xmm3, 16(%edx) 1319 lea 32(%edx, %ecx), %edx 1320 lea 32(%eax, %ecx), %eax 1321 POP (%edi) 1322 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1323 1324 CFI_PUSH (%edi) 1325 1326 .p2align 4 1327 L(sh_9_no_prefetch): 1328 lea -32(%ecx), %ecx 1329 lea -9(%eax), %eax 1330 xor %edi, %edi 1331 1332 .p2align 4 1333 L(sh_9_no_prefetch_loop): 1334 movdqa 16(%eax, %edi), %xmm2 1335 sub $32, %ecx 1336 movdqa 32(%eax, %edi), %xmm3 1337 movdqa %xmm3, %xmm4 1338 palignr $9, %xmm2, %xmm3 1339 palignr $9, %xmm1, %xmm2 1340 lea 32(%edi), %edi 1341 movdqa %xmm2, -32(%edx, %edi) 1342 movdqa %xmm3, -16(%edx, %edi) 1343 jb L(sh_9_end_no_prefetch_loop) 1344 1345 movdqa 16(%eax, %edi), %xmm2 1346 sub $32, %ecx 1347 movdqa 32(%eax, %edi), %xmm3 1348 movdqa %xmm3, %xmm1 1349 palignr $9, %xmm2, %xmm3 1350 palignr $9, %xmm4, %xmm2 1351 lea 32(%edi), %edi 1352 movdqa %xmm2, -32(%edx, %edi) 1353 movdqa %xmm3, -16(%edx, %edi) 1354 jae L(sh_9_no_prefetch_loop) 1355 1356 L(sh_9_end_no_prefetch_loop): 1357 lea 32(%ecx), %ecx 1358 add %ecx, %edi 1359 add %edi, %edx 1360 lea 9(%edi, %eax), %eax 1361 POP (%edi) 1362 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1363 1364 CFI_PUSH (%edi) 1365 1366 .p2align 4 1367 L(shl_10): 1368 #ifndef USE_AS_MEMMOVE 1369 movaps -10(%eax), %xmm1 1370 #else 1371 
/* Continuation of L(shl_10) (source is 10 bytes past 16-byte alignment).
   USE_AS_MEMMOVE variant: reload dest, store the saved head %xmm0.  */
        movl    DEST+4(%esp), %edi
        movaps  -10(%eax), %xmm1        /* xmm1 = previous aligned src block */
        movdqu  %xmm0, (%edi)
#endif
/* Choose prefetching loop only when the copy is at least half the
   data-cache size; otherwise fall to the no-prefetch loop.  */
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_10_no_prefetch)

        lea     -64(%ecx), %ecx         /* bias count for the 64-byte loop */

        .p2align 4
/* Copy 64 bytes per iteration: four aligned 16-byte loads at src-10,
   realigned into place with palignr $10 against the previous block.  */
L(Shl10LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  6(%eax), %xmm2
        movaps  22(%eax), %xmm3
        movaps  38(%eax), %xmm4
        movaps  54(%eax), %xmm5
        movaps  %xmm5, %xmm7            /* keep last block for next iter */
        palignr $10, %xmm4, %xmm5
        palignr $10, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $10, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $10, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1            /* carry block into next iteration */
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl10LoopStart)

L(Shl10LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)            /* <= 0 bytes of 32-byte work left */

        /* Copy one final 32-byte chunk, then dispatch the sub-32 tail.  */
        movaps  6(%eax), %xmm2
        movaps  22(%eax), %xmm3
        palignr $10, %xmm2, %xmm3
        palignr $10, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Small-copy variant of shl_10: 32 bytes per (unrolled x2) iteration,
   no prefetch; %edi is the running offset from src/dst.  */
L(sh_10_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -10(%eax), %eax         /* align src down by the shift */
        xor     %edi, %edi

        .p2align 4
L(sh_10_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4            /* save carry block */
        palignr $10, %xmm2, %xmm3
        palignr $10, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_10_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $10, %xmm2, %xmm3
        palignr $10, %xmm4, %xmm2       /* use carry block saved above */
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
        lea     32(%ecx), %ecx          /* undo loop bias: ecx = tail size */
        add     %ecx, %edi
        add     %edi, %edx
        lea     10(%edi, %eax), %eax    /* restore true src pointer */
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Source is 11 bytes past 16-byte alignment; same scheme as shl_10
   with shift 11 and load offsets reduced by one.  */
L(shl_11):
#ifndef USE_AS_MEMMOVE
        movaps  -11(%eax), %xmm1
#else
        movl    DEST+4(%esp), %edi
        movaps  -11(%eax), %xmm1
        movdqu  %xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_11_no_prefetch)

        lea     -64(%ecx), %ecx

        .p2align 4
L(Shl11LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  5(%eax), %xmm2
        movaps  21(%eax), %xmm3
        movaps  37(%eax), %xmm4
        movaps  53(%eax), %xmm5
        movaps  %xmm5, %xmm7
        palignr $11, %xmm4, %xmm5
        palignr $11, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $11, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $11, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl11LoopStart)

L(Shl11LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)

        movaps  5(%eax), %xmm2
        movaps  21(%eax), %xmm3
        palignr $11, %xmm2, %xmm3
        palignr $11, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
/* Tail dispatch for L(Shl11LoopLeave) above.  */
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* No-prefetch variant for shift 11 (same scheme as sh_10_no_prefetch).  */
L(sh_11_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -11(%eax), %eax
        xor     %edi, %edi

        .p2align 4
L(sh_11_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4
        palignr $11, %xmm2, %xmm3
        palignr $11, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_11_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $11, %xmm2, %xmm3
        palignr $11, %xmm4, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
        lea     32(%ecx), %ecx
        add     %ecx, %edi
        add     %edi, %edx
        lea     11(%edi, %eax), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Source is 12 bytes past 16-byte alignment.  */
L(shl_12):
#ifndef USE_AS_MEMMOVE
        movaps  -12(%eax), %xmm1
#else
        movl    DEST+4(%esp), %edi
        movaps  -12(%eax), %xmm1
        movdqu  %xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_12_no_prefetch)

        lea     -64(%ecx), %ecx

        .p2align 4
L(Shl12LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  4(%eax), %xmm2
        movaps  20(%eax), %xmm3
        movaps  36(%eax), %xmm4
        movaps  52(%eax), %xmm5
        movaps  %xmm5, %xmm7
        palignr $12, %xmm4, %xmm5
        palignr $12, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $12, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $12, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl12LoopStart)

L(Shl12LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)

        movaps  4(%eax), %xmm2
        movaps  20(%eax), %xmm3
        palignr $12, %xmm2, %xmm3
        palignr $12, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
L(sh_12_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -12(%eax), %eax
        xor     %edi, %edi

        .p2align 4
L(sh_12_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4
        palignr $12, %xmm2, %xmm3
        palignr $12, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_12_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $12, %xmm2, %xmm3
        palignr $12, %xmm4, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
        lea     32(%ecx), %ecx
        add     %ecx, %edi
        add     %edi, %edx
        lea     12(%edi, %eax), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Source is 13 bytes past 16-byte alignment.  */
L(shl_13):
#ifndef USE_AS_MEMMOVE
        movaps  -13(%eax), %xmm1
#else
        movl    DEST+4(%esp), %edi
        movaps  -13(%eax), %xmm1
        movdqu  %xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_13_no_prefetch)

        lea     -64(%ecx), %ecx

        .p2align 4
L(Shl13LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  3(%eax), %xmm2
        movaps  19(%eax), %xmm3
        movaps  35(%eax), %xmm4
        movaps  51(%eax), %xmm5
        movaps  %xmm5, %xmm7
        palignr $13, %xmm4, %xmm5
        palignr $13, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $13, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $13, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl13LoopStart)

L(Shl13LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)

        movaps  3(%eax), %xmm2
        movaps  19(%eax), %xmm3
        palignr $13, %xmm2, %xmm3
        palignr $13, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
L(sh_13_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -13(%eax), %eax
        xor     %edi, %edi

        .p2align 4
L(sh_13_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4
        palignr $13, %xmm2, %xmm3
        palignr $13, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_13_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $13, %xmm2, %xmm3
        palignr $13, %xmm4, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
        lea     32(%ecx), %ecx
        add     %ecx, %edi
        add     %edi, %edx
        lea     13(%edi, %eax), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Source is 14 bytes past 16-byte alignment.  */
L(shl_14):
#ifndef USE_AS_MEMMOVE
        movaps  -14(%eax), %xmm1
#else
        movl    DEST+4(%esp), %edi
        movaps  -14(%eax), %xmm1
        movdqu  %xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_14_no_prefetch)

        lea     -64(%ecx), %ecx

        .p2align 4
L(Shl14LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  2(%eax), %xmm2
        movaps  18(%eax), %xmm3
        movaps  34(%eax), %xmm4
        movaps  50(%eax), %xmm5
        movaps  %xmm5, %xmm7
        palignr $14, %xmm4, %xmm5
        palignr $14, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $14, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $14, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl14LoopStart)

L(Shl14LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)

        movaps  2(%eax), %xmm2
        movaps  18(%eax), %xmm3
        palignr $14, %xmm2, %xmm3
        palignr $14, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
L(sh_14_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -14(%eax), %eax
        xor     %edi, %edi

        .p2align 4
L(sh_14_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4
        palignr $14, %xmm2, %xmm3
        palignr $14, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_14_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $14, %xmm2, %xmm3
        palignr $14, %xmm4, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
        lea     32(%ecx), %ecx
        add     %ecx, %edi
        add     %edi, %edx
        lea     14(%edi, %eax), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Source is 15 bytes past 16-byte alignment.  */
L(shl_15):
#ifndef USE_AS_MEMMOVE
        movaps  -15(%eax), %xmm1
#else
        movl    DEST+4(%esp), %edi
        movaps  -15(%eax), %xmm1
        movdqu  %xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
        cmp     $DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
        cmp     __x86_data_cache_size_half, %ecx
# endif
#endif
        jb      L(sh_15_no_prefetch)

        lea     -64(%ecx), %ecx

        .p2align 4
L(Shl15LoopStart):
        prefetcht0 0x1c0(%eax)
        prefetcht0 0x1c0(%edx)
        movaps  1(%eax), %xmm2
        movaps  17(%eax), %xmm3
        movaps  33(%eax), %xmm4
        movaps  49(%eax), %xmm5
        movaps  %xmm5, %xmm7
        palignr $15, %xmm4, %xmm5
        palignr $15, %xmm3, %xmm4
        movaps  %xmm5, 48(%edx)
        palignr $15, %xmm2, %xmm3
        lea     64(%eax), %eax
        palignr $15, %xmm1, %xmm2
        movaps  %xmm4, 32(%edx)
        movaps  %xmm3, 16(%edx)
        movaps  %xmm7, %xmm1
        movaps  %xmm2, (%edx)
        lea     64(%edx), %edx
        sub     $64, %ecx
        ja      L(Shl15LoopStart)

L(Shl15LoopLeave):
        add     $32, %ecx
        jle     L(shl_end_0)

        movaps  1(%eax), %xmm2
        movaps  17(%eax), %xmm3
        palignr $15, %xmm2, %xmm3
        palignr $15, %xmm1, %xmm2

        movaps  %xmm2, (%edx)
        movaps  %xmm3, 16(%edx)
        lea     32(%edx, %ecx), %edx
        lea     32(%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
L(sh_15_no_prefetch):
        lea     -32(%ecx), %ecx
        lea     -15(%eax), %eax
        xor     %edi, %edi

        .p2align 4
L(sh_15_no_prefetch_loop):
        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm4
        palignr $15, %xmm2, %xmm3
        palignr $15, %xmm1, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jb      L(sh_15_end_no_prefetch_loop)

        movdqa  16(%eax, %edi), %xmm2
        sub     $32, %ecx
        movdqa  32(%eax, %edi), %xmm3
        movdqa  %xmm3, %xmm1
        palignr $15, %xmm2, %xmm3
        palignr $15, %xmm4, %xmm2
        lea     32(%edi), %edi
        movdqa  %xmm2, -32(%edx, %edi)
        movdqa  %xmm3, -16(%edx, %edi)
        jae     L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
        lea     32(%ecx), %ecx
        add     %ecx, %edi
        add     %edi, %edx
        lea     15(%edi, %eax), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        CFI_PUSH (%edi)

        .p2align 4
/* Common exit for all ShlNLoopLeave paths when <= 0 bytes of 32-byte
   work remain: advance pointers past the tail and dispatch it.  */
L(shl_end_0):
        lea     32(%ecx), %ecx
        lea     (%edx, %ecx), %edx
        lea     (%eax, %ecx), %eax
        POP (%edi)
        BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

        .p2align 4
/* Forward tail copies: %eax/%edx point one-past-the-end, so each entry
   copies 8 bytes and falls through to the next-smaller entry.  */
L(fwd_write_44bytes):
        movq    -44(%eax), %xmm0
        movq    %xmm0, -44(%edx)
L(fwd_write_36bytes):
        movq    -36(%eax), %xmm0
        movq    %xmm0, -36(%edx)
L(fwd_write_28bytes):
        movq    -28(%eax), %xmm0
        movq    %xmm0, -28(%edx)
/* Forward tail copies continued (lengths = 4 mod 8 chain).  */
L(fwd_write_20bytes):
        movq    -20(%eax), %xmm0
        movq    %xmm0, -20(%edx)
L(fwd_write_12bytes):
        movq    -12(%eax), %xmm0
        movq    %xmm0, -12(%edx)
L(fwd_write_4bytes):
        movl    -4(%eax), %ecx
        movl    %ecx, -4(%edx)
/* memcpy returns dest, mempcpy returns dest+len (== %edx here);
   bcopy returns nothing.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 0 mod 8 chain.  */
L(fwd_write_40bytes):
        movq    -40(%eax), %xmm0
        movq    %xmm0, -40(%edx)
L(fwd_write_32bytes):
        movq    -32(%eax), %xmm0
        movq    %xmm0, -32(%edx)
L(fwd_write_24bytes):
        movq    -24(%eax), %xmm0
        movq    %xmm0, -24(%edx)
L(fwd_write_16bytes):
        movq    -16(%eax), %xmm0
        movq    %xmm0, -16(%edx)
L(fwd_write_8bytes):
        movq    -8(%eax), %xmm0
        movq    %xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* 5 bytes via two overlapping 4-byte moves.  */
L(fwd_write_5bytes):
        movl    -5(%eax), %ecx
        movl    -4(%eax), %eax
        movl    %ecx, -5(%edx)
        movl    %eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 5 mod 8 chain; final 5 bytes as 4+1.  */
L(fwd_write_45bytes):
        movq    -45(%eax), %xmm0
        movq    %xmm0, -45(%edx)
L(fwd_write_37bytes):
        movq    -37(%eax), %xmm0
        movq    %xmm0, -37(%edx)
L(fwd_write_29bytes):
        movq    -29(%eax), %xmm0
        movq    %xmm0, -29(%edx)
L(fwd_write_21bytes):
        movq    -21(%eax), %xmm0
        movq    %xmm0, -21(%edx)
L(fwd_write_13bytes):
        movq    -13(%eax), %xmm0
        movq    %xmm0, -13(%edx)
        movl    -5(%eax), %ecx
        movl    %ecx, -5(%edx)
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 1 mod 8 chain.  */
L(fwd_write_41bytes):
        movq    -41(%eax), %xmm0
        movq    %xmm0, -41(%edx)
L(fwd_write_33bytes):
        movq    -33(%eax), %xmm0
        movq    %xmm0, -33(%edx)
L(fwd_write_25bytes):
        movq    -25(%eax), %xmm0
        movq    %xmm0, -25(%edx)
L(fwd_write_17bytes):
        movq    -17(%eax), %xmm0
        movq    %xmm0, -17(%edx)
L(fwd_write_9bytes):
        movq    -9(%eax), %xmm0
        movq    %xmm0, -9(%edx)
L(fwd_write_1bytes):
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 6 mod 8 chain; final 6 bytes as 4+2.  */
L(fwd_write_46bytes):
        movq    -46(%eax), %xmm0
        movq    %xmm0, -46(%edx)
L(fwd_write_38bytes):
        movq    -38(%eax), %xmm0
        movq    %xmm0, -38(%edx)
L(fwd_write_30bytes):
        movq    -30(%eax), %xmm0
        movq    %xmm0, -30(%edx)
L(fwd_write_22bytes):
        movq    -22(%eax), %xmm0
        movq    %xmm0, -22(%edx)
L(fwd_write_14bytes):
        movq    -14(%eax), %xmm0
        movq    %xmm0, -14(%edx)
L(fwd_write_6bytes):
        movl    -6(%eax), %ecx
        movl    %ecx, -6(%edx)
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 2 mod 8 chain.  */
L(fwd_write_42bytes):
        movq    -42(%eax), %xmm0
        movq    %xmm0, -42(%edx)
L(fwd_write_34bytes):
        movq    -34(%eax), %xmm0
        movq    %xmm0, -34(%edx)
L(fwd_write_26bytes):
        movq    -26(%eax), %xmm0
        movq    %xmm0, -26(%edx)
L(fwd_write_18bytes):
        movq    -18(%eax), %xmm0
        movq    %xmm0, -18(%edx)
L(fwd_write_10bytes):
        movq    -10(%eax), %xmm0
        movq    %xmm0, -10(%edx)
L(fwd_write_2bytes):
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN
        .p2align 4
/* Lengths = 7 mod 8 chain; final 7 bytes as 4+2+1.  */
L(fwd_write_47bytes):
        movq    -47(%eax), %xmm0
        movq    %xmm0, -47(%edx)
L(fwd_write_39bytes):
        movq    -39(%eax), %xmm0
        movq    %xmm0, -39(%edx)
L(fwd_write_31bytes):
        movq    -31(%eax), %xmm0
        movq    %xmm0, -31(%edx)
L(fwd_write_23bytes):
        movq    -23(%eax), %xmm0
        movq    %xmm0, -23(%edx)
L(fwd_write_15bytes):
        movq    -15(%eax), %xmm0
        movq    %xmm0, -15(%edx)
L(fwd_write_7bytes):
        movl    -7(%eax), %ecx
        movl    %ecx, -7(%edx)
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Lengths = 3 mod 8 chain; final 3 bytes as 2+1.  */
L(fwd_write_43bytes):
        movq    -43(%eax), %xmm0
        movq    %xmm0, -43(%edx)
L(fwd_write_35bytes):
        movq    -35(%eax), %xmm0
        movq    %xmm0, -35(%edx)
L(fwd_write_27bytes):
        movq    -27(%eax), %xmm0
        movq    %xmm0, -27(%edx)
L(fwd_write_19bytes):
        movq    -19(%eax), %xmm0
        movq    %xmm0, -19(%edx)
L(fwd_write_11bytes):
        movq    -11(%eax), %xmm0
        movq    %xmm0, -11(%edx)
L(fwd_write_3bytes):
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Aligned forward tails: 16-byte movdqa chains (both pointers are
   16-byte aligned on these paths), length = 0 mod 16.  */
L(fwd_write_40bytes_align):
        movdqa  -40(%eax), %xmm0
        movdqa  %xmm0, -40(%edx)
L(fwd_write_24bytes_align):
        movdqa  -24(%eax), %xmm0
        movdqa  %xmm0, -24(%edx)
L(fwd_write_8bytes_align):
        movq    -8(%eax), %xmm0
        movq    %xmm0, -8(%edx)
L(fwd_write_0bytes_align):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_32bytes_align):
        movdqa  -32(%eax), %xmm0
        movdqa  %xmm0, -32(%edx)
L(fwd_write_16bytes_align):
        movdqa  -16(%eax), %xmm0
        movdqa  %xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_5bytes_align):
        movl    -5(%eax), %ecx
        movl    -4(%eax), %eax
        movl    %ecx, -5(%edx)
        movl    %eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Aligned lengths = 13 mod 16 chain; final 13 bytes as 8+4+1.  */
L(fwd_write_45bytes_align):
        movdqa  -45(%eax), %xmm0
        movdqa  %xmm0, -45(%edx)
L(fwd_write_29bytes_align):
        movdqa  -29(%eax), %xmm0
        movdqa  %xmm0, -29(%edx)
L(fwd_write_13bytes_align):
        movq    -13(%eax), %xmm0
        movq    %xmm0, -13(%edx)
        movl    -5(%eax), %ecx
        movl    %ecx, -5(%edx)
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_37bytes_align):
        movdqa  -37(%eax), %xmm0
        movdqa  %xmm0, -37(%edx)
L(fwd_write_21bytes_align):
        movdqa  -21(%eax), %xmm0
        movdqa  %xmm0, -21(%edx)
        movl    -5(%eax), %ecx
        movl    %ecx, -5(%edx)
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_41bytes_align):
        movdqa  -41(%eax), %xmm0
        movdqa  %xmm0, -41(%edx)
L(fwd_write_25bytes_align):
        movdqa  -25(%eax), %xmm0
        movdqa  %xmm0, -25(%edx)
L(fwd_write_9bytes_align):
        movq    -9(%eax), %xmm0
        movq    %xmm0, -9(%edx)
L(fwd_write_1bytes_align):
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_33bytes_align):
        movdqa  -33(%eax), %xmm0
        movdqa  %xmm0, -33(%edx)
L(fwd_write_17bytes_align):
        movdqa  -17(%eax), %xmm0
        movdqa  %xmm0, -17(%edx)
        movzbl  -1(%eax), %ecx
        movb    %cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_46bytes_align):
        movdqa  -46(%eax), %xmm0
        movdqa  %xmm0, -46(%edx)
L(fwd_write_30bytes_align):
        movdqa  -30(%eax), %xmm0
        movdqa  %xmm0, -30(%edx)
L(fwd_write_14bytes_align):
        movq    -14(%eax), %xmm0
        movq    %xmm0, -14(%edx)
L(fwd_write_6bytes_align):
        movl    -6(%eax), %ecx
        movl    %ecx, -6(%edx)
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_38bytes_align):
        movdqa  -38(%eax), %xmm0
        movdqa  %xmm0, -38(%edx)
L(fwd_write_22bytes_align):
        movdqa  -22(%eax), %xmm0
        movdqa  %xmm0, -22(%edx)
        movl    -6(%eax), %ecx
        movl    %ecx, -6(%edx)
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_42bytes_align):
        movdqa  -42(%eax), %xmm0
        movdqa  %xmm0, -42(%edx)
L(fwd_write_26bytes_align):
        movdqa  -26(%eax), %xmm0
        movdqa  %xmm0, -26(%edx)
L(fwd_write_10bytes_align):
        movq    -10(%eax), %xmm0
        movq    %xmm0, -10(%edx)
L(fwd_write_2bytes_align):
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
/* Continuation of the fwd_write_2bytes_align return-value epilogue.  */
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_34bytes_align):
        movdqa  -34(%eax), %xmm0
        movdqa  %xmm0, -34(%edx)
L(fwd_write_18bytes_align):
        movdqa  -18(%eax), %xmm0
        movdqa  %xmm0, -18(%edx)
        movzwl  -2(%eax), %ecx
        movw    %cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
/* Aligned lengths = 15 mod 16 chain; final 7 bytes as 4+2+1.  */
L(fwd_write_47bytes_align):
        movdqa  -47(%eax), %xmm0
        movdqa  %xmm0, -47(%edx)
L(fwd_write_31bytes_align):
        movdqa  -31(%eax), %xmm0
        movdqa  %xmm0, -31(%edx)
L(fwd_write_15bytes_align):
        movq    -15(%eax), %xmm0
        movq    %xmm0, -15(%edx)
L(fwd_write_7bytes_align):
        movl    -7(%eax), %ecx
        movl    %ecx, -7(%edx)
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_39bytes_align):
        movdqa  -39(%eax), %xmm0
        movdqa  %xmm0, -39(%edx)
L(fwd_write_23bytes_align):
        movdqa  -23(%eax), %xmm0
        movdqa  %xmm0, -23(%edx)
        movl    -7(%eax), %ecx
        movl    %ecx, -7(%edx)
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_43bytes_align):
        movdqa  -43(%eax), %xmm0
        movdqa  %xmm0, -43(%edx)
L(fwd_write_27bytes_align):
        movdqa  -27(%eax), %xmm0
        movdqa  %xmm0, -27(%edx)
L(fwd_write_11bytes_align):
        movq    -11(%eax), %xmm0
        movq    %xmm0, -11(%edx)
L(fwd_write_3bytes_align):
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_35bytes_align):
        movdqa  -35(%eax), %xmm0
        movdqa  %xmm0, -35(%edx)
L(fwd_write_19bytes_align):
        movdqa  -19(%eax), %xmm0
        movdqa  %xmm0, -19(%edx)
        movzwl  -3(%eax), %ecx
        movzbl  -1(%eax), %eax
        movw    %cx, -3(%edx)
        movb    %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_44bytes_align):
        movdqa  -44(%eax), %xmm0
        movdqa  %xmm0, -44(%edx)
L(fwd_write_28bytes_align):
        movdqa  -28(%eax), %xmm0
        movdqa  %xmm0, -28(%edx)
L(fwd_write_12bytes_align):
        movq    -12(%eax), %xmm0
        movq    %xmm0, -12(%edx)
L(fwd_write_4bytes_align):
        movl    -4(%eax), %ecx
        movl    %ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN

        .p2align 4
L(fwd_write_36bytes_align):
        movdqa  -36(%eax), %xmm0
        movdqa  %xmm0, -36(%edx)
L(fwd_write_20bytes_align):
        movdqa  -20(%eax), %xmm0
        movdqa  %xmm0, -20(%edx)
        movl    -4(%eax), %ecx
        movl    %ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
        movl    %edx, %eax
# else
        movl    DEST(%esp), %eax
# endif
#endif
        RETURN_END

        CFI_PUSH (%edi)

        .p2align 4
/* Very large copies: bypass the cache with non-temporal movntdq
   stores, 128 bytes per loop iteration.  */
L(large_page):
        movdqu  (%eax), %xmm1
#ifdef USE_AS_MEMMOVE
        movl    DEST+4(%esp), %edi
        movdqu  %xmm0, (%edi)           /* store saved head block */
#endif
        lea     16(%eax), %eax
        movntdq %xmm1, (%edx)
        lea     16(%edx), %edx
        lea     -0x90(%ecx), %ecx       /* bias count for the 128B loop */
        POP (%edi)

        .p2align 4
L(large_page_loop):
        movdqu  (%eax), %xmm0
        movdqu  0x10(%eax), %xmm1
        movdqu  0x20(%eax), %xmm2
        movdqu  0x30(%eax), %xmm3
        movdqu  0x40(%eax), %xmm4
        movdqu  0x50(%eax), %xmm5
        movdqu  0x60(%eax), %xmm6
        movdqu  0x70(%eax), %xmm7
        lea     0x80(%eax), %eax

        sub     $0x80, %ecx
        movntdq %xmm0, (%edx)
        movntdq %xmm1, 0x10(%edx)
        movntdq %xmm2, 0x20(%edx)
        movntdq %xmm3, 0x30(%edx)
        movntdq %xmm4, 0x40(%edx)
        movntdq %xmm5, 0x50(%edx)
        movntdq %xmm6, 0x60(%edx)
        movntdq %xmm7, 0x70(%edx)
        lea     0x80(%edx), %edx
        jae     L(large_page_loop)
        cmp     $-0x40, %ecx            /* >= 64 bytes still to copy?  */
        lea     0x80(%ecx), %ecx        /* undo the loop bias */
        jl      L(large_page_less_64bytes)

        /* Copy one 64-byte chunk.  */
        movdqu  (%eax), %xmm0
        movdqu  0x10(%eax), %xmm1
        movdqu  0x20(%eax), %xmm2
        movdqu  0x30(%eax), %xmm3
        lea     0x40(%eax), %eax

        movntdq %xmm0, (%edx)
        movntdq %xmm1, 0x10(%edx)
        movntdq %xmm2, 0x20(%edx)
        movntdq %xmm3, 0x30(%edx)
        lea     0x40(%edx), %edx
        sub     $0x40, %ecx
L(large_page_less_64bytes):
        cmp     $32, %ecx
        jb      L(large_page_less_32bytes)
        movdqu  (%eax), %xmm0
        movdqu  0x10(%eax), %xmm1
        lea     0x20(%eax), %eax
        movntdq %xmm0, (%edx)
        movntdq %xmm1, 0x10(%edx)
        lea     0x20(%edx), %edx
        sub     $0x20, %ecx
L(large_page_less_32bytes):
        add     %ecx, %edx
        add     %ecx, %eax
        sfence                          /* order the non-temporal stores */
        BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

        .p2align 4
/* Backward tail copies: %eax/%edx point at the start of the tail;
   each entry copies its top 8 bytes and falls through.  */
L(bk_write_44bytes):
        movq    36(%eax), %xmm0
        movq    %xmm0, 36(%edx)
L(bk_write_36bytes):
        movq    28(%eax), %xmm0
        movq    %xmm0, 28(%edx)
L(bk_write_28bytes):
        movq    20(%eax), %xmm0
        movq    %xmm0, 20(%edx)
L(bk_write_20bytes):
        movq    12(%eax), %xmm0
        movq    %xmm0, 12(%edx)
L(bk_write_12bytes):
        movq    4(%eax), %xmm0
        movq    %xmm0, 4(%edx)
L(bk_write_4bytes):
        movl    (%eax), %ecx
        movl    %ecx, (%edx)
L(bk_write_0bytes):
/* memcpy returns dest; mempcpy returns dest+len (recomputed from the
   stack since %edx no longer holds it here).  */
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
/* Return for L(bk_write_0bytes) above.  */
        RETURN

        .p2align 4
/* Backward lengths = 0 mod 8 chain.  */
L(bk_write_40bytes):
        movq    32(%eax), %xmm0
        movq    %xmm0, 32(%edx)
L(bk_write_32bytes):
        movq    24(%eax), %xmm0
        movq    %xmm0, 24(%edx)
L(bk_write_24bytes):
        movq    16(%eax), %xmm0
        movq    %xmm0, 16(%edx)
L(bk_write_16bytes):
        movq    8(%eax), %xmm0
        movq    %xmm0, 8(%edx)
L(bk_write_8bytes):
        movq    (%eax), %xmm0
        movq    %xmm0, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN

        .p2align 4
/* Backward lengths = 5 mod 8 chain; bottom 5 bytes as 4+1.  */
L(bk_write_45bytes):
        movq    37(%eax), %xmm0
        movq    %xmm0, 37(%edx)
L(bk_write_37bytes):
        movq    29(%eax), %xmm0
        movq    %xmm0, 29(%edx)
L(bk_write_29bytes):
        movq    21(%eax), %xmm0
        movq    %xmm0, 21(%edx)
L(bk_write_21bytes):
        movq    13(%eax), %xmm0
        movq    %xmm0, 13(%edx)
L(bk_write_13bytes):
        movq    5(%eax), %xmm0
        movq    %xmm0, 5(%edx)
L(bk_write_5bytes):
        movl    1(%eax), %ecx
        movl    %ecx, 1(%edx)
L(bk_write_1bytes):
        movzbl  (%eax), %ecx
        movb    %cl, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN

        .p2align 4
/* Backward lengths = 1 mod 8 chain.  */
L(bk_write_41bytes):
        movq    33(%eax), %xmm0
        movq    %xmm0, 33(%edx)
L(bk_write_33bytes):
        movq    25(%eax), %xmm0
        movq    %xmm0, 25(%edx)
L(bk_write_25bytes):
        movq    17(%eax), %xmm0
        movq    %xmm0, 17(%edx)
L(bk_write_17bytes):
        movq    9(%eax), %xmm0
        movq    %xmm0, 9(%edx)
L(bk_write_9bytes):
        movq    1(%eax), %xmm0
        movq    %xmm0, 1(%edx)
        movzbl  (%eax), %ecx
        movb    %cl, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN

        .p2align 4
/* Backward lengths = 6 mod 8 chain; bottom 6 bytes as 4+2.  */
L(bk_write_46bytes):
        movq    38(%eax), %xmm0
        movq    %xmm0, 38(%edx)
L(bk_write_38bytes):
        movq    30(%eax), %xmm0
        movq    %xmm0, 30(%edx)
L(bk_write_30bytes):
        movq    22(%eax), %xmm0
        movq    %xmm0, 22(%edx)
L(bk_write_22bytes):
        movq    14(%eax), %xmm0
        movq    %xmm0, 14(%edx)
L(bk_write_14bytes):
        movq    6(%eax), %xmm0
        movq    %xmm0, 6(%edx)
L(bk_write_6bytes):
        movl    2(%eax), %ecx
        movl    %ecx, 2(%edx)
        movzwl  (%eax), %ecx
        movw    %cx, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN

        .p2align 4
/* Backward lengths = 2 mod 8 chain.  */
L(bk_write_42bytes):
        movq    34(%eax), %xmm0
        movq    %xmm0, 34(%edx)
L(bk_write_34bytes):
        movq    26(%eax), %xmm0
        movq    %xmm0, 26(%edx)
L(bk_write_26bytes):
        movq    18(%eax), %xmm0
        movq    %xmm0, 18(%edx)
L(bk_write_18bytes):
        movq    10(%eax), %xmm0
        movq    %xmm0, 10(%edx)
L(bk_write_10bytes):
        movq    2(%eax), %xmm0
        movq    %xmm0, 2(%edx)
L(bk_write_2bytes):
        movzwl  (%eax), %ecx
        movw    %cx, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN

        .p2align 4
/* Backward lengths = 7 mod 8 chain; bottom 7 bytes as 4+2+1.  */
L(bk_write_47bytes):
        movq    39(%eax), %xmm0
        movq    %xmm0, 39(%edx)
L(bk_write_39bytes):
        movq    31(%eax), %xmm0
        movq    %xmm0, 31(%edx)
L(bk_write_31bytes):
        movq    23(%eax), %xmm0
        movq    %xmm0, 23(%edx)
L(bk_write_23bytes):
        movq    15(%eax), %xmm0
        movq    %xmm0, 15(%edx)
L(bk_write_15bytes):
        movq    7(%eax), %xmm0
        movq    %xmm0, 7(%edx)
L(bk_write_7bytes):
        movl    3(%eax), %ecx
        movl    %ecx, 3(%edx)
        movzwl  1(%eax), %ecx
        movw    %cx, 1(%edx)
        movzbl  (%eax), %eax
        movb    %al, (%edx)
#ifndef USE_AS_BCOPY
        movl    DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
        movl    LEN(%esp), %ecx
        add     %ecx, %eax
# endif
#endif
        RETURN
/* Last backward-tail chain: sizes congruent to 3 mod 8
   (43/35/27/19/11/3), ending in 2 + 1 byte stores.  */

	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	/* memcpy/mempcpy return a pointer; bcopy returns void.  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END


/* Jump tables of 32-bit offsets relative to the table's own label
   (JMPTBL expands to "target - table" under PIC), dispatched by
   BRANCH_TO_JMPTBL_ENTRY.  Each table is indexed by the remaining
   byte count, so entry order must exactly match the label numbering
   below — do not reorder or insert entries.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
/* Forward-copy tails, 0..47 remaining bytes.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
/* Forward-copy tails for the aligned variant, 0..47 remaining bytes.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
/* Dispatch on src/dst misalignment difference (0..15) for the
   palignr-style shifted-copy loops.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
/* Backward-copy tails, 0..47 remaining bytes.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
	.p2align 4
/* Backward (high-to-low) copy for overlapping memmove.
   On entry EAX = src, EDX = dst, ECX = len; both pointers are first
   advanced to one-past-the-end and then walked downward.  EDI (callee
   saved, hence the PUSH/POP with CFI bookkeeping) tracks the source.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time (8-byte movq shuttles, highest first).  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind src/dst to the start of the remaining ECX bytes and
	   dispatch to the L(bk_write_*) tail via the bwd jump table.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* Re-sync CFI state: the paths below still have EDI pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
	   then	(EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy 4-byte words
   (at most three) until the destination end is 16-byte aligned,
   so the main loop below may use aligned movdqa stores.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main backward loop: 64 bytes per iteration, unaligned loads
   (movdqu) and 16-byte-aligned stores (movdqa — EDX was aligned
   above).  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)