1 /* 2 Copyright (c) 2010, Intel Corporation 3 All rights reserved. 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 11 * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 15 * Neither the name of Intel Corporation nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/* SSSE3-optimized memcpy for i386 (default exported name:
   ssse3_memcpy5).  The same source also builds memmove / bcopy
   variants when USE_AS_MEMMOVE / USE_AS_BCOPY are defined.
   NOTE(review): this chunk is the top of a larger file -- the jump
   tables (L(table_48bytes_fwd), L(shl_table), ...), the
   backward-copy paths (L(copy_backward), L(bk_write...)) and
   END(MEMCPY) are defined further down, past the end of this view.  */

#ifndef MEMCPY
# define MEMCPY ssse3_memcpy5
#endif

/* Local labels get the assembler-local .L prefix.  */
#ifndef L
# define L(label) .L##label
#endif

/* DWARF CFI wrappers, overridable by an including environment
   (glibc-style sysdep headers predefine these).  */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

/* Function entry/exit: symbol type, global visibility, 16-byte
   alignment of the entry point, and CFI bracketing.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* bcopy(src, dest, n) swaps the first two arguments relative to
   memcpy(dest, src, n); these offsets hide that difference.  */
#ifdef USE_AS_BCOPY
# define SRC PARMS
# define DEST SRC+4
# define LEN DEST+4
#else
# define DEST PARMS
# define SRC DEST+4
# define LEN SRC+4
#endif

/* Keep the unwind info in sync with manual pushes/pops of
   callee-saved registers.  */
#define CFI_PUSH(REG) \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#if (defined SHARED || defined __PIC__)
# define PARMS 8	/* Preserve EBX.  */
# define ENTRANCE PUSH (%ebx);
# define RETURN_END POP (%ebx); ret
# define RETURN RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B) I - B
# undef __i686

/* Materialize the PC in a register via the i686 get-pc thunk.  */
# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register contains the
   index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    /* We first load PC into EBX.  */ \
    SETUP_PIC_REG(bx); \
    /* Get the address of the jump table.  */ \
    addl $(TABLE - .), %ebx; \
    /* Get the entry and convert the relative offset to the \
       absolute address.  */ \
    addl (%ebx, INDEX, SCALE), %ebx; \
    /* We loaded the jump table.  Go.  */ \
    jmp *%ebx
#else

# define PARMS 4
# define ENTRANCE
# define RETURN_END ret
# define RETURN RETURN_END
# define JMPTBL(I, B) I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register contains the index into the
   jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    jmp *TABLE(, INDEX, SCALE)
#endif

	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	/* Register roles for the whole function:
	   %ecx = byte count, %eax = src, %edx = dest.  */
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	/* memmove: choose copy direction from the pointer order.  */
	cmp	%eax, %edx
	jb	L(copy_forward)
	je	L(fwd_write_0bytes)
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)

	.p2align 4
L(memmove_bwd):
	add	%ecx, %eax
	cmp	%eax, %edx		/* dest < src + len: ranges overlap */
	movl	SRC(%esp), %eax
	jb	L(copy_backward)

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)

L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* NOTE(review): compares only the low bytes of src/dest --
	   presumably a cheap direction heuristic for small copies;
	   confirm against the table code later in the file.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	/* Advance both pointers to the end; the jump-table entries
	   (defined later in the file) finish the residual copy,
	   indexed by the remaining length.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
	.p2align 4
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif

	.p2align 4
L(48bytesormore):
#ifndef USE_AS_MEMMOVE
	/* Copy the 16-byte head now; the main loops below restart at
	   the next 16-byte boundary of dest.  */
	movlpd	(%eax), %xmm0
	movlpd	8(%eax), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
#else
	/* memmove: hold the head in %xmm0 and store it later (inside
	   the shl_N blocks), since aligned stores below could clobber
	   an overlapping source.  */
	movdqu	(%eax), %xmm0
#endif
	PUSH (%edi)
	/* Round dest up to a 16-byte boundary; adjust src and len by
	   the same slack so the loops can use aligned stores.  */
	movl	%edx, %edi
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %edi
	add	%edi, %ecx
	sub	%edi, %eax

	/* Large copies (>= half the shared cache) take the
	   L(large_page) path (defined later in the file).  */
#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi
	jae	L(large_page)
	/* Dispatch on src & 15 to the matching palignr block.  */
	and	$0xf, %edi
	jz	L(shl_0)
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)

	.p2align 4
L(shl_0):
	/* src and dest are mutually 16-byte aligned.  */
#ifdef USE_AS_MEMMOVE
	/* DEST+4: %esp moved down 4 bytes when %edi was pushed.  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	xor	%edi, %edi
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	lea	-32(%ecx), %ecx

	/* <= 127 bytes: at most four unrolled 32-byte aligned copies;
	   %ecx is biased by -32 so CF from the sub ends the loop.  */
	.p2align 4
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi

L(shl_0_end):
	lea	32(%ecx), %ecx		/* undo the -32 bias */
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	/* Re-balance CFI: the POP above only executes on the path that
	   leaves through the jump table.  */
	CFI_PUSH (%edi)

	.p2align 4
L(shl_0_gobble):
	/* >= 128 bytes, mutually aligned: pick the plain in-cache loop
	   or the prefetching loop based on half the data-cache size.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	POP	(%edi)
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)

	.p2align 4
/* In-cache loop for large mutually aligned copies: 128 bytes per
   iteration through xmm0-xmm7.  %ecx carries a -128 bias; CF clear
   after the sub means another full iteration remains.  */
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	/* Tail: un-bias %ecx, then peel 64-, 32- and 16-byte chunks.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_cache_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_cache_less_16bytes):
	/* Final < 16 bytes via the length-indexed jump table.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	/* Same 128-byte copy, for working sets larger than half the
	   data cache: software-prefetch ahead on both streams.  */
	.p2align 4
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_mem_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_mem_less_16bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	/* L(shl_N) blocks: src sits N bytes past a 16-byte boundary
	   while dest is 16-byte aligned.  Aligned vectors are read
	   from src-N and neighbouring pairs are stitched with
	   palignr $N, so every load and store is aligned.  %xmm1
	   carries the previous aligned chunk into each iteration.  */
	.p2align 4
L(shl_1):
#ifndef USE_AS_MEMMOVE
	movaps	-1(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi	/* +4: %edi was pushed after entry */
	movaps	-1(%eax), %xmm1
	movdqu	%xmm0, (%edi)		/* flush the saved head (memmove) */
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_1_no_prefetch)

	lea	-64(%ecx), %ecx

	/* 64 bytes per iteration, with prefetch.  */
	.p2align 4
L(Shl1LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	movaps	47(%eax), %xmm4
	movaps	63(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$1, %xmm4, %xmm5
	palignr	$1, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$1, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1		/* carry last chunk forward */
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl1LoopStart)

L(Shl1LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	/* One final 32-byte stitched copy, then the tail table.  */
	movaps	15(%eax), %xmm2
	movaps	31(%eax), %xmm3
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_1_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-1(%eax), %eax		/* bias src so loads are aligned */
	xor	%edi, %edi

	.p2align 4
L(sh_1_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_1_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_1_no_prefetch_loop)

L(sh_1_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	1(%edi, %eax), %eax	/* undo the -1 src bias */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As L(shl_1); shift = 2.  */
	.p2align 4
L(shl_2):
#ifndef USE_AS_MEMMOVE
	movaps	-2(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-2(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_2_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl2LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	movaps	46(%eax), %xmm4
	movaps	62(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$2, %xmm4, %xmm5
	palignr	$2, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$2, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl2LoopStart)

L(Shl2LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	14(%eax), %xmm2
	movaps	30(%eax), %xmm3
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_2_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-2(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_2_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_2_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_2_no_prefetch_loop)

L(sh_2_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	2(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As L(shl_1); shift = 3.  */
	.p2align 4
L(shl_3):
#ifndef USE_AS_MEMMOVE
	movaps	-3(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-3(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_3_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl3LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	movaps	45(%eax), %xmm4
	movaps	61(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$3, %xmm4, %xmm5
	palignr	$3, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$3, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl3LoopStart)

L(Shl3LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	13(%eax), %xmm2
	movaps	29(%eax), %xmm3
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_3_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-3(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_3_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_3_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_3_no_prefetch_loop)

L(sh_3_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	3(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As L(shl_1); shift = 4.  */
	.p2align 4
L(shl_4):
#ifndef USE_AS_MEMMOVE
	movaps	-4(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-4(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_4_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl4LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	movaps	44(%eax), %xmm4
	movaps	60(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$4, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl4LoopStart)

L(Shl4LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	12(%eax), %xmm2
	movaps	28(%eax), %xmm3
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_4_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-4(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_4_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_4_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_4_no_prefetch_loop)

L(sh_4_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	4(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As L(shl_1); shift = 5.  */
	.p2align 4
L(shl_5):
#ifndef USE_AS_MEMMOVE
	movaps	-5(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-5(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_5_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl5LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	movaps	43(%eax), %xmm4
	movaps	59(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$5, %xmm4, %xmm5
	palignr	$5, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$5, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl5LoopStart)

L(Shl5LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	11(%eax), %xmm2
	movaps	27(%eax), %xmm3
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_5_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-5(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_5_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_5_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_5_no_prefetch_loop)

L(sh_5_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	5(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As L(shl_1); shift = 6.  */
	.p2align 4
L(shl_6):
#ifndef USE_AS_MEMMOVE
	movaps	-6(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-6(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
/* Continuation of L(shl_6): finish the data-cache-size compare and
   pick the prefetching or plain loop.  */
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_6_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl6LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	movaps	42(%eax), %xmm4
	movaps	58(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$6, %xmm4, %xmm5
	palignr	$6, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$6, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl6LoopStart)

L(Shl6LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_6_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-6(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_6_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jb	L(sh_6_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(sh_6_no_prefetch_loop)

L(sh_6_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	6(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As the other L(shl_N) blocks; shift = 7.  */
	.p2align 4
L(shl_7):
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-7(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_7_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)

L(Shl7LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-7(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As the other L(shl_N) blocks; shift = 8.  */
	.p2align 4
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_8_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)

/* NOTE(review): name breaks the Shl{N}LoopLeave convention used by
   the sibling blocks.  */
L(LoopLeave8):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	8(%eax), %xmm2
/* Continuation of L(LoopLeave8): last 32-byte stitched copy of the
   L(shl_8) path, then fall into the tail jump table.  */
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-8(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	8(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As the other L(shl_N) blocks; shift = 9.  */
	.p2align 4
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_9_no_prefetch)

	lea	-64(%ecx), %ecx

	.p2align 4
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)

L(Shl9LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)

	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-9(%eax), %eax
	xor	%edi, %edi

	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	9(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* As the other L(shl_N) blocks; shift = 10.  */
	.p2align 4
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1
#else
movl DEST+4(%esp), %edi 1369 movaps -10(%eax), %xmm1 1370 movdqu %xmm0, (%edi) 1371 #endif 1372 #ifdef DATA_CACHE_SIZE_HALF 1373 cmp $DATA_CACHE_SIZE_HALF, %ecx 1374 #else 1375 # if (defined SHARED || defined __PIC__) 1376 SETUP_PIC_REG(bx) 1377 add $_GLOBAL_OFFSET_TABLE_, %ebx 1378 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1379 # else 1380 cmp __x86_data_cache_size_half, %ecx 1381 # endif 1382 #endif 1383 jb L(sh_10_no_prefetch) 1384 1385 lea -64(%ecx), %ecx 1386 1387 .p2align 4 1388 L(Shl10LoopStart): 1389 prefetcht0 0x1c0(%eax) 1390 prefetcht0 0x1c0(%edx) 1391 movaps 6(%eax), %xmm2 1392 movaps 22(%eax), %xmm3 1393 movaps 38(%eax), %xmm4 1394 movaps 54(%eax), %xmm5 1395 movaps %xmm5, %xmm7 1396 palignr $10, %xmm4, %xmm5 1397 palignr $10, %xmm3, %xmm4 1398 movaps %xmm5, 48(%edx) 1399 palignr $10, %xmm2, %xmm3 1400 lea 64(%eax), %eax 1401 palignr $10, %xmm1, %xmm2 1402 movaps %xmm4, 32(%edx) 1403 movaps %xmm3, 16(%edx) 1404 movaps %xmm7, %xmm1 1405 movaps %xmm2, (%edx) 1406 lea 64(%edx), %edx 1407 sub $64, %ecx 1408 ja L(Shl10LoopStart) 1409 1410 L(Shl10LoopLeave): 1411 add $32, %ecx 1412 jle L(shl_end_0) 1413 1414 movaps 6(%eax), %xmm2 1415 movaps 22(%eax), %xmm3 1416 palignr $10, %xmm2, %xmm3 1417 palignr $10, %xmm1, %xmm2 1418 1419 movaps %xmm2, (%edx) 1420 movaps %xmm3, 16(%edx) 1421 lea 32(%edx, %ecx), %edx 1422 lea 32(%eax, %ecx), %eax 1423 POP (%edi) 1424 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1425 1426 CFI_PUSH (%edi) 1427 1428 .p2align 4 1429 L(sh_10_no_prefetch): 1430 lea -32(%ecx), %ecx 1431 lea -10(%eax), %eax 1432 xor %edi, %edi 1433 1434 .p2align 4 1435 L(sh_10_no_prefetch_loop): 1436 movdqa 16(%eax, %edi), %xmm2 1437 sub $32, %ecx 1438 movdqa 32(%eax, %edi), %xmm3 1439 movdqa %xmm3, %xmm4 1440 palignr $10, %xmm2, %xmm3 1441 palignr $10, %xmm1, %xmm2 1442 lea 32(%edi), %edi 1443 movdqa %xmm2, -32(%edx, %edi) 1444 movdqa %xmm3, -16(%edx, %edi) 1445 jb L(sh_10_end_no_prefetch_loop) 1446 1447 movdqa 16(%eax, %edi), %xmm2 1448 sub 
$32, %ecx 1449 movdqa 32(%eax, %edi), %xmm3 1450 movdqa %xmm3, %xmm1 1451 palignr $10, %xmm2, %xmm3 1452 palignr $10, %xmm4, %xmm2 1453 lea 32(%edi), %edi 1454 movdqa %xmm2, -32(%edx, %edi) 1455 movdqa %xmm3, -16(%edx, %edi) 1456 jae L(sh_10_no_prefetch_loop) 1457 1458 L(sh_10_end_no_prefetch_loop): 1459 lea 32(%ecx), %ecx 1460 add %ecx, %edi 1461 add %edi, %edx 1462 lea 10(%edi, %eax), %eax 1463 POP (%edi) 1464 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1465 1466 CFI_PUSH (%edi) 1467 1468 .p2align 4 1469 L(shl_11): 1470 #ifndef USE_AS_MEMMOVE 1471 movaps -11(%eax), %xmm1 1472 #else 1473 movl DEST+4(%esp), %edi 1474 movaps -11(%eax), %xmm1 1475 movdqu %xmm0, (%edi) 1476 #endif 1477 #ifdef DATA_CACHE_SIZE_HALF 1478 cmp $DATA_CACHE_SIZE_HALF, %ecx 1479 #else 1480 # if (defined SHARED || defined __PIC__) 1481 SETUP_PIC_REG(bx) 1482 add $_GLOBAL_OFFSET_TABLE_, %ebx 1483 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1484 # else 1485 cmp __x86_data_cache_size_half, %ecx 1486 # endif 1487 #endif 1488 jb L(sh_11_no_prefetch) 1489 1490 lea -64(%ecx), %ecx 1491 1492 .p2align 4 1493 L(Shl11LoopStart): 1494 prefetcht0 0x1c0(%eax) 1495 prefetcht0 0x1c0(%edx) 1496 movaps 5(%eax), %xmm2 1497 movaps 21(%eax), %xmm3 1498 movaps 37(%eax), %xmm4 1499 movaps 53(%eax), %xmm5 1500 movaps %xmm5, %xmm7 1501 palignr $11, %xmm4, %xmm5 1502 palignr $11, %xmm3, %xmm4 1503 movaps %xmm5, 48(%edx) 1504 palignr $11, %xmm2, %xmm3 1505 lea 64(%eax), %eax 1506 palignr $11, %xmm1, %xmm2 1507 movaps %xmm4, 32(%edx) 1508 movaps %xmm3, 16(%edx) 1509 movaps %xmm7, %xmm1 1510 movaps %xmm2, (%edx) 1511 lea 64(%edx), %edx 1512 sub $64, %ecx 1513 ja L(Shl11LoopStart) 1514 1515 L(Shl11LoopLeave): 1516 add $32, %ecx 1517 jle L(shl_end_0) 1518 1519 movaps 5(%eax), %xmm2 1520 movaps 21(%eax), %xmm3 1521 palignr $11, %xmm2, %xmm3 1522 palignr $11, %xmm1, %xmm2 1523 1524 movaps %xmm2, (%edx) 1525 movaps %xmm3, 16(%edx) 1526 lea 32(%edx, %ecx), %edx 1527 lea 32(%eax, %ecx), %eax 1528 POP (%edi) 1529 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1530 1531 CFI_PUSH (%edi) 1532 1533 .p2align 4 1534 L(sh_11_no_prefetch): 1535 lea -32(%ecx), %ecx 1536 lea -11(%eax), %eax 1537 xor %edi, %edi 1538 1539 .p2align 4 1540 L(sh_11_no_prefetch_loop): 1541 movdqa 16(%eax, %edi), %xmm2 1542 sub $32, %ecx 1543 movdqa 32(%eax, %edi), %xmm3 1544 movdqa %xmm3, %xmm4 1545 palignr $11, %xmm2, %xmm3 1546 palignr $11, %xmm1, %xmm2 1547 lea 32(%edi), %edi 1548 movdqa %xmm2, -32(%edx, %edi) 1549 movdqa %xmm3, -16(%edx, %edi) 1550 jb L(sh_11_end_no_prefetch_loop) 1551 1552 movdqa 16(%eax, %edi), %xmm2 1553 sub $32, %ecx 1554 movdqa 32(%eax, %edi), %xmm3 1555 movdqa %xmm3, %xmm1 1556 palignr $11, %xmm2, %xmm3 1557 palignr $11, %xmm4, %xmm2 1558 lea 32(%edi), %edi 1559 movdqa %xmm2, -32(%edx, %edi) 1560 movdqa %xmm3, -16(%edx, %edi) 1561 jae L(sh_11_no_prefetch_loop) 1562 1563 L(sh_11_end_no_prefetch_loop): 1564 lea 32(%ecx), %ecx 1565 add %ecx, %edi 1566 add %edi, %edx 1567 lea 11(%edi, %eax), %eax 1568 POP (%edi) 1569 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1570 1571 CFI_PUSH (%edi) 1572 1573 .p2align 4 1574 L(shl_12): 1575 #ifndef USE_AS_MEMMOVE 1576 movaps -12(%eax), %xmm1 1577 #else 1578 movl DEST+4(%esp), %edi 1579 movaps -12(%eax), %xmm1 1580 movdqu %xmm0, (%edi) 1581 #endif 1582 #ifdef DATA_CACHE_SIZE_HALF 1583 cmp $DATA_CACHE_SIZE_HALF, %ecx 1584 #else 1585 # if (defined SHARED || defined __PIC__) 1586 SETUP_PIC_REG(bx) 1587 add $_GLOBAL_OFFSET_TABLE_, %ebx 1588 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1589 # else 1590 cmp __x86_data_cache_size_half, %ecx 1591 # endif 1592 #endif 1593 jb L(sh_12_no_prefetch) 1594 1595 lea -64(%ecx), %ecx 1596 1597 .p2align 4 1598 L(Shl12LoopStart): 1599 prefetcht0 0x1c0(%eax) 1600 prefetcht0 0x1c0(%edx) 1601 movaps 4(%eax), %xmm2 1602 movaps 20(%eax), %xmm3 1603 movaps 36(%eax), %xmm4 1604 movaps 52(%eax), %xmm5 1605 movaps %xmm5, %xmm7 1606 palignr $12, %xmm4, %xmm5 1607 palignr $12, %xmm3, %xmm4 1608 movaps 
%xmm5, 48(%edx) 1609 palignr $12, %xmm2, %xmm3 1610 lea 64(%eax), %eax 1611 palignr $12, %xmm1, %xmm2 1612 movaps %xmm4, 32(%edx) 1613 movaps %xmm3, 16(%edx) 1614 movaps %xmm7, %xmm1 1615 movaps %xmm2, (%edx) 1616 lea 64(%edx), %edx 1617 sub $64, %ecx 1618 ja L(Shl12LoopStart) 1619 1620 L(Shl12LoopLeave): 1621 add $32, %ecx 1622 jle L(shl_end_0) 1623 1624 movaps 4(%eax), %xmm2 1625 movaps 20(%eax), %xmm3 1626 palignr $12, %xmm2, %xmm3 1627 palignr $12, %xmm1, %xmm2 1628 1629 movaps %xmm2, (%edx) 1630 movaps %xmm3, 16(%edx) 1631 lea 32(%edx, %ecx), %edx 1632 lea 32(%eax, %ecx), %eax 1633 POP (%edi) 1634 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1635 1636 CFI_PUSH (%edi) 1637 1638 .p2align 4 1639 L(sh_12_no_prefetch): 1640 lea -32(%ecx), %ecx 1641 lea -12(%eax), %eax 1642 xor %edi, %edi 1643 1644 .p2align 4 1645 L(sh_12_no_prefetch_loop): 1646 movdqa 16(%eax, %edi), %xmm2 1647 sub $32, %ecx 1648 movdqa 32(%eax, %edi), %xmm3 1649 movdqa %xmm3, %xmm4 1650 palignr $12, %xmm2, %xmm3 1651 palignr $12, %xmm1, %xmm2 1652 lea 32(%edi), %edi 1653 movdqa %xmm2, -32(%edx, %edi) 1654 movdqa %xmm3, -16(%edx, %edi) 1655 jb L(sh_12_end_no_prefetch_loop) 1656 1657 movdqa 16(%eax, %edi), %xmm2 1658 sub $32, %ecx 1659 movdqa 32(%eax, %edi), %xmm3 1660 movdqa %xmm3, %xmm1 1661 palignr $12, %xmm2, %xmm3 1662 palignr $12, %xmm4, %xmm2 1663 lea 32(%edi), %edi 1664 movdqa %xmm2, -32(%edx, %edi) 1665 movdqa %xmm3, -16(%edx, %edi) 1666 jae L(sh_12_no_prefetch_loop) 1667 1668 L(sh_12_end_no_prefetch_loop): 1669 lea 32(%ecx), %ecx 1670 add %ecx, %edi 1671 add %edi, %edx 1672 lea 12(%edi, %eax), %eax 1673 POP (%edi) 1674 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1675 1676 CFI_PUSH (%edi) 1677 1678 .p2align 4 1679 L(shl_13): 1680 #ifndef USE_AS_MEMMOVE 1681 movaps -13(%eax), %xmm1 1682 #else 1683 movl DEST+4(%esp), %edi 1684 movaps -13(%eax), %xmm1 1685 movdqu %xmm0, (%edi) 1686 #endif 1687 #ifdef DATA_CACHE_SIZE_HALF 1688 cmp $DATA_CACHE_SIZE_HALF, %ecx 1689 #else 1690 # 
if (defined SHARED || defined __PIC__) 1691 SETUP_PIC_REG(bx) 1692 add $_GLOBAL_OFFSET_TABLE_, %ebx 1693 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1694 # else 1695 cmp __x86_data_cache_size_half, %ecx 1696 # endif 1697 #endif 1698 jb L(sh_13_no_prefetch) 1699 1700 lea -64(%ecx), %ecx 1701 1702 .p2align 4 1703 L(Shl13LoopStart): 1704 prefetcht0 0x1c0(%eax) 1705 prefetcht0 0x1c0(%edx) 1706 movaps 3(%eax), %xmm2 1707 movaps 19(%eax), %xmm3 1708 movaps 35(%eax), %xmm4 1709 movaps 51(%eax), %xmm5 1710 movaps %xmm5, %xmm7 1711 palignr $13, %xmm4, %xmm5 1712 palignr $13, %xmm3, %xmm4 1713 movaps %xmm5, 48(%edx) 1714 palignr $13, %xmm2, %xmm3 1715 lea 64(%eax), %eax 1716 palignr $13, %xmm1, %xmm2 1717 movaps %xmm4, 32(%edx) 1718 movaps %xmm3, 16(%edx) 1719 movaps %xmm7, %xmm1 1720 movaps %xmm2, (%edx) 1721 lea 64(%edx), %edx 1722 sub $64, %ecx 1723 ja L(Shl13LoopStart) 1724 1725 L(Shl13LoopLeave): 1726 add $32, %ecx 1727 jle L(shl_end_0) 1728 1729 movaps 3(%eax), %xmm2 1730 movaps 19(%eax), %xmm3 1731 palignr $13, %xmm2, %xmm3 1732 palignr $13, %xmm1, %xmm2 1733 1734 movaps %xmm2, (%edx) 1735 movaps %xmm3, 16(%edx) 1736 lea 32(%edx, %ecx), %edx 1737 lea 32(%eax, %ecx), %eax 1738 POP (%edi) 1739 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1740 1741 CFI_PUSH (%edi) 1742 1743 .p2align 4 1744 L(sh_13_no_prefetch): 1745 lea -32(%ecx), %ecx 1746 lea -13(%eax), %eax 1747 xor %edi, %edi 1748 1749 .p2align 4 1750 L(sh_13_no_prefetch_loop): 1751 movdqa 16(%eax, %edi), %xmm2 1752 sub $32, %ecx 1753 movdqa 32(%eax, %edi), %xmm3 1754 movdqa %xmm3, %xmm4 1755 palignr $13, %xmm2, %xmm3 1756 palignr $13, %xmm1, %xmm2 1757 lea 32(%edi), %edi 1758 movdqa %xmm2, -32(%edx, %edi) 1759 movdqa %xmm3, -16(%edx, %edi) 1760 jb L(sh_13_end_no_prefetch_loop) 1761 1762 movdqa 16(%eax, %edi), %xmm2 1763 sub $32, %ecx 1764 movdqa 32(%eax, %edi), %xmm3 1765 movdqa %xmm3, %xmm1 1766 palignr $13, %xmm2, %xmm3 1767 palignr $13, %xmm4, %xmm2 1768 lea 32(%edi), %edi 1769 movdqa %xmm2, 
-32(%edx, %edi) 1770 movdqa %xmm3, -16(%edx, %edi) 1771 jae L(sh_13_no_prefetch_loop) 1772 1773 L(sh_13_end_no_prefetch_loop): 1774 lea 32(%ecx), %ecx 1775 add %ecx, %edi 1776 add %edi, %edx 1777 lea 13(%edi, %eax), %eax 1778 POP (%edi) 1779 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1780 1781 CFI_PUSH (%edi) 1782 1783 .p2align 4 1784 L(shl_14): 1785 #ifndef USE_AS_MEMMOVE 1786 movaps -14(%eax), %xmm1 1787 #else 1788 movl DEST+4(%esp), %edi 1789 movaps -14(%eax), %xmm1 1790 movdqu %xmm0, (%edi) 1791 #endif 1792 #ifdef DATA_CACHE_SIZE_HALF 1793 cmp $DATA_CACHE_SIZE_HALF, %ecx 1794 #else 1795 # if (defined SHARED || defined __PIC__) 1796 SETUP_PIC_REG(bx) 1797 add $_GLOBAL_OFFSET_TABLE_, %ebx 1798 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1799 # else 1800 cmp __x86_data_cache_size_half, %ecx 1801 # endif 1802 #endif 1803 jb L(sh_14_no_prefetch) 1804 1805 lea -64(%ecx), %ecx 1806 1807 .p2align 4 1808 L(Shl14LoopStart): 1809 prefetcht0 0x1c0(%eax) 1810 prefetcht0 0x1c0(%edx) 1811 movaps 2(%eax), %xmm2 1812 movaps 18(%eax), %xmm3 1813 movaps 34(%eax), %xmm4 1814 movaps 50(%eax), %xmm5 1815 movaps %xmm5, %xmm7 1816 palignr $14, %xmm4, %xmm5 1817 palignr $14, %xmm3, %xmm4 1818 movaps %xmm5, 48(%edx) 1819 palignr $14, %xmm2, %xmm3 1820 lea 64(%eax), %eax 1821 palignr $14, %xmm1, %xmm2 1822 movaps %xmm4, 32(%edx) 1823 movaps %xmm3, 16(%edx) 1824 movaps %xmm7, %xmm1 1825 movaps %xmm2, (%edx) 1826 lea 64(%edx), %edx 1827 sub $64, %ecx 1828 ja L(Shl14LoopStart) 1829 1830 L(Shl14LoopLeave): 1831 add $32, %ecx 1832 jle L(shl_end_0) 1833 1834 movaps 2(%eax), %xmm2 1835 movaps 18(%eax), %xmm3 1836 palignr $14, %xmm2, %xmm3 1837 palignr $14, %xmm1, %xmm2 1838 1839 movaps %xmm2, (%edx) 1840 movaps %xmm3, 16(%edx) 1841 lea 32(%edx, %ecx), %edx 1842 lea 32(%eax, %ecx), %eax 1843 POP (%edi) 1844 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1845 1846 CFI_PUSH (%edi) 1847 1848 .p2align 4 1849 L(sh_14_no_prefetch): 1850 lea -32(%ecx), %ecx 1851 lea -14(%eax), 
%eax 1852 xor %edi, %edi 1853 1854 .p2align 4 1855 L(sh_14_no_prefetch_loop): 1856 movdqa 16(%eax, %edi), %xmm2 1857 sub $32, %ecx 1858 movdqa 32(%eax, %edi), %xmm3 1859 movdqa %xmm3, %xmm4 1860 palignr $14, %xmm2, %xmm3 1861 palignr $14, %xmm1, %xmm2 1862 lea 32(%edi), %edi 1863 movdqa %xmm2, -32(%edx, %edi) 1864 movdqa %xmm3, -16(%edx, %edi) 1865 jb L(sh_14_end_no_prefetch_loop) 1866 1867 movdqa 16(%eax, %edi), %xmm2 1868 sub $32, %ecx 1869 movdqa 32(%eax, %edi), %xmm3 1870 movdqa %xmm3, %xmm1 1871 palignr $14, %xmm2, %xmm3 1872 palignr $14, %xmm4, %xmm2 1873 lea 32(%edi), %edi 1874 movdqa %xmm2, -32(%edx, %edi) 1875 movdqa %xmm3, -16(%edx, %edi) 1876 jae L(sh_14_no_prefetch_loop) 1877 1878 L(sh_14_end_no_prefetch_loop): 1879 lea 32(%ecx), %ecx 1880 add %ecx, %edi 1881 add %edi, %edx 1882 lea 14(%edi, %eax), %eax 1883 POP (%edi) 1884 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1885 1886 CFI_PUSH (%edi) 1887 1888 .p2align 4 1889 L(shl_15): 1890 #ifndef USE_AS_MEMMOVE 1891 movaps -15(%eax), %xmm1 1892 #else 1893 movl DEST+4(%esp), %edi 1894 movaps -15(%eax), %xmm1 1895 movdqu %xmm0, (%edi) 1896 #endif 1897 #ifdef DATA_CACHE_SIZE_HALF 1898 cmp $DATA_CACHE_SIZE_HALF, %ecx 1899 #else 1900 # if (defined SHARED || defined __PIC__) 1901 SETUP_PIC_REG(bx) 1902 add $_GLOBAL_OFFSET_TABLE_, %ebx 1903 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1904 # else 1905 cmp __x86_data_cache_size_half, %ecx 1906 # endif 1907 #endif 1908 jb L(sh_15_no_prefetch) 1909 1910 lea -64(%ecx), %ecx 1911 1912 .p2align 4 1913 L(Shl15LoopStart): 1914 prefetcht0 0x1c0(%eax) 1915 prefetcht0 0x1c0(%edx) 1916 movaps 1(%eax), %xmm2 1917 movaps 17(%eax), %xmm3 1918 movaps 33(%eax), %xmm4 1919 movaps 49(%eax), %xmm5 1920 movaps %xmm5, %xmm7 1921 palignr $15, %xmm4, %xmm5 1922 palignr $15, %xmm3, %xmm4 1923 movaps %xmm5, 48(%edx) 1924 palignr $15, %xmm2, %xmm3 1925 lea 64(%eax), %eax 1926 palignr $15, %xmm1, %xmm2 1927 movaps %xmm4, 32(%edx) 1928 movaps %xmm3, 16(%edx) 1929 movaps %xmm7, 
%xmm1 1930 movaps %xmm2, (%edx) 1931 lea 64(%edx), %edx 1932 sub $64, %ecx 1933 ja L(Shl15LoopStart) 1934 1935 L(Shl15LoopLeave): 1936 add $32, %ecx 1937 jle L(shl_end_0) 1938 1939 movaps 1(%eax), %xmm2 1940 movaps 17(%eax), %xmm3 1941 palignr $15, %xmm2, %xmm3 1942 palignr $15, %xmm1, %xmm2 1943 1944 movaps %xmm2, (%edx) 1945 movaps %xmm3, 16(%edx) 1946 lea 32(%edx, %ecx), %edx 1947 lea 32(%eax, %ecx), %eax 1948 POP (%edi) 1949 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1950 1951 CFI_PUSH (%edi) 1952 1953 .p2align 4 1954 L(sh_15_no_prefetch): 1955 lea -32(%ecx), %ecx 1956 lea -15(%eax), %eax 1957 xor %edi, %edi 1958 1959 .p2align 4 1960 L(sh_15_no_prefetch_loop): 1961 movdqa 16(%eax, %edi), %xmm2 1962 sub $32, %ecx 1963 movdqa 32(%eax, %edi), %xmm3 1964 movdqa %xmm3, %xmm4 1965 palignr $15, %xmm2, %xmm3 1966 palignr $15, %xmm1, %xmm2 1967 lea 32(%edi), %edi 1968 movdqa %xmm2, -32(%edx, %edi) 1969 movdqa %xmm3, -16(%edx, %edi) 1970 jb L(sh_15_end_no_prefetch_loop) 1971 1972 movdqa 16(%eax, %edi), %xmm2 1973 sub $32, %ecx 1974 movdqa 32(%eax, %edi), %xmm3 1975 movdqa %xmm3, %xmm1 1976 palignr $15, %xmm2, %xmm3 1977 palignr $15, %xmm4, %xmm2 1978 lea 32(%edi), %edi 1979 movdqa %xmm2, -32(%edx, %edi) 1980 movdqa %xmm3, -16(%edx, %edi) 1981 jae L(sh_15_no_prefetch_loop) 1982 1983 L(sh_15_end_no_prefetch_loop): 1984 lea 32(%ecx), %ecx 1985 add %ecx, %edi 1986 add %edi, %edx 1987 lea 15(%edi, %eax), %eax 1988 POP (%edi) 1989 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1990 1991 CFI_PUSH (%edi) 1992 1993 .p2align 4 1994 L(shl_end_0): 1995 lea 32(%ecx), %ecx 1996 lea (%edx, %ecx), %edx 1997 lea (%eax, %ecx), %eax 1998 POP (%edi) 1999 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 2000 2001 .p2align 4 2002 L(fwd_write_44bytes): 2003 movq -44(%eax), %xmm0 2004 movq %xmm0, -44(%edx) 2005 L(fwd_write_36bytes): 2006 movq -36(%eax), %xmm0 2007 movq %xmm0, -36(%edx) 2008 L(fwd_write_28bytes): 2009 movq -28(%eax), %xmm0 2010 movq %xmm0, -28(%edx) 2011 
/* Forward-copy tail handlers, dispatched through L(table_48bytes_fwd) with
   %ecx = remaining byte count (0..47).  The table entry for N bytes jumps to
   L(fwd_write_Nbytes); each labelled entry copies an 8-byte chunk and falls
   through to the entry for N-8, so exactly N bytes are written in total.
   NOTE(review): %eax/%edx appear to point one byte past the end of the
   source/destination tail (hence the negative offsets) — this matches the
   `lea 32(%edx, %ecx), %edx` setups before the table branches earlier in
   this file; confirm against the dispatch sites.  */
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0	/* bytes [-20,-13) of the tail */
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx		/* final 4 bytes via a GPR */
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns dst + len (%edx already past the end) */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return the original dst */
# endif
#endif
	RETURN				/* restores %ebx in PIC builds (see HEAD macro) */

	.p2align 4
/* Multiple-of-8 tail sizes: 40/32/24/16/8/0 bytes.  */
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy return value */
# else
	movl	DEST(%esp), %eax	/* memcpy return value */
# endif
#endif
	RETURN

	.p2align 4
/* 5-byte tail: two overlapping 4-byte moves (bytes -5..-2 and -4..-1).  */
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax		/* %eax is dead as a pointer after this load */
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 5 (mod 8): 45/37/29/21/13; the shared epilogue after the 13-byte
   entry writes the last 5 bytes as a dword + byte (overlap-free).  */
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx		/* bytes -5..-2 */
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx		/* final byte */
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Forward-copy tail handlers for sizes ≡ 1, 6 and 2 (mod 8).  Same scheme as
   the other L(fwd_write_*) ladders: entered via L(table_48bytes_fwd), each
   8-byte rung falls through to the next, and %eax/%edx index backwards from
   just past the end of the tail (NOTE(review): pointer convention inferred
   from the table-branch setup code — confirm).  */
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx		/* single trailing byte */
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy returns dst + len */
# else
	movl	DEST(%esp), %eax	/* memcpy/memmove return original dst */
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 6 (mod 8): dword + word epilogue writes the final 6 bytes.  */
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx		/* bytes -6..-3 */
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx		/* final 2 bytes */
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 2 (mod 8): word epilogue writes the final 2 bytes.  */
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

.p2align 4 2186 L(fwd_write_47bytes): 2187 movq -47(%eax), %xmm0 2188 movq %xmm0, -47(%edx) 2189 L(fwd_write_39bytes): 2190 movq -39(%eax), %xmm0 2191 movq %xmm0, -39(%edx) 2192 L(fwd_write_31bytes): 2193 movq -31(%eax), %xmm0 2194 movq %xmm0, -31(%edx) 2195 L(fwd_write_23bytes): 2196 movq -23(%eax), %xmm0 2197 movq %xmm0, -23(%edx) 2198 L(fwd_write_15bytes): 2199 movq -15(%eax), %xmm0 2200 movq %xmm0, -15(%edx) 2201 L(fwd_write_7bytes): 2202 movl -7(%eax), %ecx 2203 movl %ecx, -7(%edx) 2204 movzwl -3(%eax), %ecx 2205 movzbl -1(%eax), %eax 2206 movw %cx, -3(%edx) 2207 movb %al, -1(%edx) 2208 #ifndef USE_AS_BCOPY 2209 # ifdef USE_AS_MEMPCPY 2210 movl %edx, %eax 2211 # else 2212 movl DEST(%esp), %eax 2213 # endif 2214 #endif 2215 RETURN 2216 2217 .p2align 4 2218 L(fwd_write_43bytes): 2219 movq -43(%eax), %xmm0 2220 movq %xmm0, -43(%edx) 2221 L(fwd_write_35bytes): 2222 movq -35(%eax), %xmm0 2223 movq %xmm0, -35(%edx) 2224 L(fwd_write_27bytes): 2225 movq -27(%eax), %xmm0 2226 movq %xmm0, -27(%edx) 2227 L(fwd_write_19bytes): 2228 movq -19(%eax), %xmm0 2229 movq %xmm0, -19(%edx) 2230 L(fwd_write_11bytes): 2231 movq -11(%eax), %xmm0 2232 movq %xmm0, -11(%edx) 2233 L(fwd_write_3bytes): 2234 movzwl -3(%eax), %ecx 2235 movzbl -1(%eax), %eax 2236 movw %cx, -3(%edx) 2237 movb %al, -1(%edx) 2238 #ifndef USE_AS_BCOPY 2239 # ifdef USE_AS_MEMPCPY 2240 movl %edx, %eax 2241 # else 2242 movl DEST(%esp), %eax 2243 # endif 2244 #endif 2245 RETURN 2246 2247 .p2align 4 2248 L(fwd_write_40bytes_align): 2249 movdqa -40(%eax), %xmm0 2250 movdqa %xmm0, -40(%edx) 2251 L(fwd_write_24bytes_align): 2252 movdqa -24(%eax), %xmm0 2253 movdqa %xmm0, -24(%edx) 2254 L(fwd_write_8bytes_align): 2255 movq -8(%eax), %xmm0 2256 movq %xmm0, -8(%edx) 2257 L(fwd_write_0bytes_align): 2258 #ifndef USE_AS_BCOPY 2259 # ifdef USE_AS_MEMPCPY 2260 movl %edx, %eax 2261 # else 2262 movl DEST(%esp), %eax 2263 # endif 2264 #endif 2265 RETURN 2266 2267 .p2align 4 2268 L(fwd_write_32bytes_align): 2269 movdqa -32(%eax), 
%xmm0 2270 movdqa %xmm0, -32(%edx) 2271 L(fwd_write_16bytes_align): 2272 movdqa -16(%eax), %xmm0 2273 movdqa %xmm0, -16(%edx) 2274 #ifndef USE_AS_BCOPY 2275 # ifdef USE_AS_MEMPCPY 2276 movl %edx, %eax 2277 # else 2278 movl DEST(%esp), %eax 2279 # endif 2280 #endif 2281 RETURN 2282 2283 .p2align 4 2284 L(fwd_write_5bytes_align): 2285 movl -5(%eax), %ecx 2286 movl -4(%eax), %eax 2287 movl %ecx, -5(%edx) 2288 movl %eax, -4(%edx) 2289 #ifndef USE_AS_BCOPY 2290 # ifdef USE_AS_MEMPCPY 2291 movl %edx, %eax 2292 # else 2293 movl DEST(%esp), %eax 2294 # endif 2295 #endif 2296 RETURN 2297 2298 .p2align 4 2299 L(fwd_write_45bytes_align): 2300 movdqa -45(%eax), %xmm0 2301 movdqa %xmm0, -45(%edx) 2302 L(fwd_write_29bytes_align): 2303 movdqa -29(%eax), %xmm0 2304 movdqa %xmm0, -29(%edx) 2305 L(fwd_write_13bytes_align): 2306 movq -13(%eax), %xmm0 2307 movq %xmm0, -13(%edx) 2308 movl -5(%eax), %ecx 2309 movl %ecx, -5(%edx) 2310 movzbl -1(%eax), %ecx 2311 movb %cl, -1(%edx) 2312 #ifndef USE_AS_BCOPY 2313 # ifdef USE_AS_MEMPCPY 2314 movl %edx, %eax 2315 # else 2316 movl DEST(%esp), %eax 2317 # endif 2318 #endif 2319 RETURN 2320 2321 .p2align 4 2322 L(fwd_write_37bytes_align): 2323 movdqa -37(%eax), %xmm0 2324 movdqa %xmm0, -37(%edx) 2325 L(fwd_write_21bytes_align): 2326 movdqa -21(%eax), %xmm0 2327 movdqa %xmm0, -21(%edx) 2328 movl -5(%eax), %ecx 2329 movl %ecx, -5(%edx) 2330 movzbl -1(%eax), %ecx 2331 movb %cl, -1(%edx) 2332 #ifndef USE_AS_BCOPY 2333 # ifdef USE_AS_MEMPCPY 2334 movl %edx, %eax 2335 # else 2336 movl DEST(%esp), %eax 2337 # endif 2338 #endif 2339 RETURN 2340 2341 .p2align 4 2342 L(fwd_write_41bytes_align): 2343 movdqa -41(%eax), %xmm0 2344 movdqa %xmm0, -41(%edx) 2345 L(fwd_write_25bytes_align): 2346 movdqa -25(%eax), %xmm0 2347 movdqa %xmm0, -25(%edx) 2348 L(fwd_write_9bytes_align): 2349 movq -9(%eax), %xmm0 2350 movq %xmm0, -9(%edx) 2351 L(fwd_write_1bytes_align): 2352 movzbl -1(%eax), %ecx 2353 movb %cl, -1(%edx) 2354 #ifndef USE_AS_BCOPY 2355 # ifdef 
USE_AS_MEMPCPY 2356 movl %edx, %eax 2357 # else 2358 movl DEST(%esp), %eax 2359 # endif 2360 #endif 2361 RETURN 2362 2363 .p2align 4 2364 L(fwd_write_33bytes_align): 2365 movdqa -33(%eax), %xmm0 2366 movdqa %xmm0, -33(%edx) 2367 L(fwd_write_17bytes_align): 2368 movdqa -17(%eax), %xmm0 2369 movdqa %xmm0, -17(%edx) 2370 movzbl -1(%eax), %ecx 2371 movb %cl, -1(%edx) 2372 #ifndef USE_AS_BCOPY 2373 # ifdef USE_AS_MEMPCPY 2374 movl %edx, %eax 2375 # else 2376 movl DEST(%esp), %eax 2377 # endif 2378 #endif 2379 RETURN 2380 2381 .p2align 4 2382 L(fwd_write_46bytes_align): 2383 movdqa -46(%eax), %xmm0 2384 movdqa %xmm0, -46(%edx) 2385 L(fwd_write_30bytes_align): 2386 movdqa -30(%eax), %xmm0 2387 movdqa %xmm0, -30(%edx) 2388 L(fwd_write_14bytes_align): 2389 movq -14(%eax), %xmm0 2390 movq %xmm0, -14(%edx) 2391 L(fwd_write_6bytes_align): 2392 movl -6(%eax), %ecx 2393 movl %ecx, -6(%edx) 2394 movzwl -2(%eax), %ecx 2395 movw %cx, -2(%edx) 2396 #ifndef USE_AS_BCOPY 2397 # ifdef USE_AS_MEMPCPY 2398 movl %edx, %eax 2399 # else 2400 movl DEST(%esp), %eax 2401 # endif 2402 #endif 2403 RETURN 2404 2405 .p2align 4 2406 L(fwd_write_38bytes_align): 2407 movdqa -38(%eax), %xmm0 2408 movdqa %xmm0, -38(%edx) 2409 L(fwd_write_22bytes_align): 2410 movdqa -22(%eax), %xmm0 2411 movdqa %xmm0, -22(%edx) 2412 movl -6(%eax), %ecx 2413 movl %ecx, -6(%edx) 2414 movzwl -2(%eax), %ecx 2415 movw %cx, -2(%edx) 2416 #ifndef USE_AS_BCOPY 2417 # ifdef USE_AS_MEMPCPY 2418 movl %edx, %eax 2419 # else 2420 movl DEST(%esp), %eax 2421 # endif 2422 #endif 2423 RETURN 2424 2425 .p2align 4 2426 L(fwd_write_42bytes_align): 2427 movdqa -42(%eax), %xmm0 2428 movdqa %xmm0, -42(%edx) 2429 L(fwd_write_26bytes_align): 2430 movdqa -26(%eax), %xmm0 2431 movdqa %xmm0, -26(%edx) 2432 L(fwd_write_10bytes_align): 2433 movq -10(%eax), %xmm0 2434 movq %xmm0, -10(%edx) 2435 L(fwd_write_2bytes_align): 2436 movzwl -2(%eax), %ecx 2437 movw %cx, -2(%edx) 2438 #ifndef USE_AS_BCOPY 2439 # ifdef USE_AS_MEMPCPY 2440 movl %edx, %eax 2441 
# else 2442 movl DEST(%esp), %eax 2443 # endif 2444 #endif 2445 RETURN 2446 2447 .p2align 4 2448 L(fwd_write_34bytes_align): 2449 movdqa -34(%eax), %xmm0 2450 movdqa %xmm0, -34(%edx) 2451 L(fwd_write_18bytes_align): 2452 movdqa -18(%eax), %xmm0 2453 movdqa %xmm0, -18(%edx) 2454 movzwl -2(%eax), %ecx 2455 movw %cx, -2(%edx) 2456 #ifndef USE_AS_BCOPY 2457 # ifdef USE_AS_MEMPCPY 2458 movl %edx, %eax 2459 # else 2460 movl DEST(%esp), %eax 2461 # endif 2462 #endif 2463 RETURN 2464 2465 .p2align 4 2466 L(fwd_write_47bytes_align): 2467 movdqa -47(%eax), %xmm0 2468 movdqa %xmm0, -47(%edx) 2469 L(fwd_write_31bytes_align): 2470 movdqa -31(%eax), %xmm0 2471 movdqa %xmm0, -31(%edx) 2472 L(fwd_write_15bytes_align): 2473 movq -15(%eax), %xmm0 2474 movq %xmm0, -15(%edx) 2475 L(fwd_write_7bytes_align): 2476 movl -7(%eax), %ecx 2477 movl %ecx, -7(%edx) 2478 movzwl -3(%eax), %ecx 2479 movzbl -1(%eax), %eax 2480 movw %cx, -3(%edx) 2481 movb %al, -1(%edx) 2482 #ifndef USE_AS_BCOPY 2483 # ifdef USE_AS_MEMPCPY 2484 movl %edx, %eax 2485 # else 2486 movl DEST(%esp), %eax 2487 # endif 2488 #endif 2489 RETURN 2490 2491 .p2align 4 2492 L(fwd_write_39bytes_align): 2493 movdqa -39(%eax), %xmm0 2494 movdqa %xmm0, -39(%edx) 2495 L(fwd_write_23bytes_align): 2496 movdqa -23(%eax), %xmm0 2497 movdqa %xmm0, -23(%edx) 2498 movl -7(%eax), %ecx 2499 movl %ecx, -7(%edx) 2500 movzwl -3(%eax), %ecx 2501 movzbl -1(%eax), %eax 2502 movw %cx, -3(%edx) 2503 movb %al, -1(%edx) 2504 #ifndef USE_AS_BCOPY 2505 # ifdef USE_AS_MEMPCPY 2506 movl %edx, %eax 2507 # else 2508 movl DEST(%esp), %eax 2509 # endif 2510 #endif 2511 RETURN 2512 2513 .p2align 4 2514 L(fwd_write_43bytes_align): 2515 movdqa -43(%eax), %xmm0 2516 movdqa %xmm0, -43(%edx) 2517 L(fwd_write_27bytes_align): 2518 movdqa -27(%eax), %xmm0 2519 movdqa %xmm0, -27(%edx) 2520 L(fwd_write_11bytes_align): 2521 movq -11(%eax), %xmm0 2522 movq %xmm0, -11(%edx) 2523 L(fwd_write_3bytes_align): 2524 movzwl -3(%eax), %ecx 2525 movzbl -1(%eax), %eax 2526 movw %cx, 
-3(%edx) 2527 movb %al, -1(%edx) 2528 #ifndef USE_AS_BCOPY 2529 # ifdef USE_AS_MEMPCPY 2530 movl %edx, %eax 2531 # else 2532 movl DEST(%esp), %eax 2533 # endif 2534 #endif 2535 RETURN 2536 2537 .p2align 4 2538 L(fwd_write_35bytes_align): 2539 movdqa -35(%eax), %xmm0 2540 movdqa %xmm0, -35(%edx) 2541 L(fwd_write_19bytes_align): 2542 movdqa -19(%eax), %xmm0 2543 movdqa %xmm0, -19(%edx) 2544 movzwl -3(%eax), %ecx 2545 movzbl -1(%eax), %eax 2546 movw %cx, -3(%edx) 2547 movb %al, -1(%edx) 2548 #ifndef USE_AS_BCOPY 2549 # ifdef USE_AS_MEMPCPY 2550 movl %edx, %eax 2551 # else 2552 movl DEST(%esp), %eax 2553 # endif 2554 #endif 2555 RETURN 2556 2557 .p2align 4 2558 L(fwd_write_44bytes_align): 2559 movdqa -44(%eax), %xmm0 2560 movdqa %xmm0, -44(%edx) 2561 L(fwd_write_28bytes_align): 2562 movdqa -28(%eax), %xmm0 2563 movdqa %xmm0, -28(%edx) 2564 L(fwd_write_12bytes_align): 2565 movq -12(%eax), %xmm0 2566 movq %xmm0, -12(%edx) 2567 L(fwd_write_4bytes_align): 2568 movl -4(%eax), %ecx 2569 movl %ecx, -4(%edx) 2570 #ifndef USE_AS_BCOPY 2571 # ifdef USE_AS_MEMPCPY 2572 movl %edx, %eax 2573 # else 2574 movl DEST(%esp), %eax 2575 # endif 2576 #endif 2577 RETURN 2578 2579 .p2align 4 2580 L(fwd_write_36bytes_align): 2581 movdqa -36(%eax), %xmm0 2582 movdqa %xmm0, -36(%edx) 2583 L(fwd_write_20bytes_align): 2584 movdqa -20(%eax), %xmm0 2585 movdqa %xmm0, -20(%edx) 2586 movl -4(%eax), %ecx 2587 movl %ecx, -4(%edx) 2588 #ifndef USE_AS_BCOPY 2589 # ifdef USE_AS_MEMPCPY 2590 movl %edx, %eax 2591 # else 2592 movl DEST(%esp), %eax 2593 # endif 2594 #endif 2595 RETURN_END 2596 2597 CFI_PUSH (%edi) 2598 2599 .p2align 4 2600 L(large_page): 2601 movdqu (%eax), %xmm1 2602 #ifdef USE_AS_MEMMOVE 2603 movl DEST+4(%esp), %edi 2604 movdqu %xmm0, (%edi) 2605 #endif 2606 lea 16(%eax), %eax 2607 movntdq %xmm1, (%edx) 2608 lea 16(%edx), %edx 2609 lea -0x90(%ecx), %ecx 2610 POP (%edi) 2611 2612 .p2align 4 2613 L(large_page_loop): 2614 movdqu (%eax), %xmm0 2615 movdqu 0x10(%eax), %xmm1 2616 movdqu 
0x20(%eax), %xmm2 2617 movdqu 0x30(%eax), %xmm3 2618 movdqu 0x40(%eax), %xmm4 2619 movdqu 0x50(%eax), %xmm5 2620 movdqu 0x60(%eax), %xmm6 2621 movdqu 0x70(%eax), %xmm7 2622 lea 0x80(%eax), %eax 2623 2624 sub $0x80, %ecx 2625 movntdq %xmm0, (%edx) 2626 movntdq %xmm1, 0x10(%edx) 2627 movntdq %xmm2, 0x20(%edx) 2628 movntdq %xmm3, 0x30(%edx) 2629 movntdq %xmm4, 0x40(%edx) 2630 movntdq %xmm5, 0x50(%edx) 2631 movntdq %xmm6, 0x60(%edx) 2632 movntdq %xmm7, 0x70(%edx) 2633 lea 0x80(%edx), %edx 2634 jae L(large_page_loop) 2635 cmp $-0x40, %ecx 2636 lea 0x80(%ecx), %ecx 2637 jl L(large_page_less_64bytes) 2638 2639 movdqu (%eax), %xmm0 2640 movdqu 0x10(%eax), %xmm1 2641 movdqu 0x20(%eax), %xmm2 2642 movdqu 0x30(%eax), %xmm3 2643 lea 0x40(%eax), %eax 2644 2645 movntdq %xmm0, (%edx) 2646 movntdq %xmm1, 0x10(%edx) 2647 movntdq %xmm2, 0x20(%edx) 2648 movntdq %xmm3, 0x30(%edx) 2649 lea 0x40(%edx), %edx 2650 sub $0x40, %ecx 2651 L(large_page_less_64bytes): 2652 cmp $32, %ecx 2653 jb L(large_page_less_32bytes) 2654 movdqu (%eax), %xmm0 2655 movdqu 0x10(%eax), %xmm1 2656 lea 0x20(%eax), %eax 2657 movntdq %xmm0, (%edx) 2658 movntdq %xmm1, 0x10(%edx) 2659 lea 0x20(%edx), %edx 2660 sub $0x20, %ecx 2661 L(large_page_less_32bytes): 2662 add %ecx, %edx 2663 add %ecx, %eax 2664 sfence 2665 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 2666 2667 .p2align 4 2668 L(bk_write_44bytes): 2669 movq 36(%eax), %xmm0 2670 movq %xmm0, 36(%edx) 2671 L(bk_write_36bytes): 2672 movq 28(%eax), %xmm0 2673 movq %xmm0, 28(%edx) 2674 L(bk_write_28bytes): 2675 movq 20(%eax), %xmm0 2676 movq %xmm0, 20(%edx) 2677 L(bk_write_20bytes): 2678 movq 12(%eax), %xmm0 2679 movq %xmm0, 12(%edx) 2680 L(bk_write_12bytes): 2681 movq 4(%eax), %xmm0 2682 movq %xmm0, 4(%edx) 2683 L(bk_write_4bytes): 2684 movl (%eax), %ecx 2685 movl %ecx, (%edx) 2686 L(bk_write_0bytes): 2687 #ifndef USE_AS_BCOPY 2688 movl DEST(%esp), %eax 2689 # ifdef USE_AS_MEMPCPY 2690 movl LEN(%esp), %ecx 2691 add %ecx, %eax 2692 # endif 2693 #endif 
	RETURN				/* end of the bk_write_44/36/28/20/12/4 ladder (head on the previous span) */

	.p2align 4
/* Backward-copy (memmove overlap path) tail handlers: here %eax/%edx point
   at the START of the remaining region and %ecx bytes are left, so offsets
   are positive and each N-byte entry covers bytes [0, N).  Each 8-byte rung
   falls through to the rung for N-8, working down toward offset 0.
   For the return value: memcpy/memmove return the original DEST argument;
   mempcpy returns DEST + LEN (recomputed from the stack args); bcopy
   returns nothing.  RETURN restores %ebx in PIC builds (see HEAD macro).  */
L(bk_write_40bytes):
	movq	32(%eax), %xmm0		/* bytes 32..39 */
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0		/* first 8 bytes, written last */
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax	/* memcpy/memmove return value */
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax		/* mempcpy returns dst + len */
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 5 (mod 8): dword at offset 1 plus the byte at offset 0.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx		/* bytes 1..4 */
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx		/* byte 0 */
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 1 (mod 8): the 9-byte entry's movq covers bytes 1..8 and the
   shared byte store covers byte 0.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0		/* unaligned movq: bytes 1..8 */
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 6 (mod 8): dword at offset 2 plus the word at offset 0.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx		/* bytes 2..5 */
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx		/* bytes 0..1 */
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 2 (mod 8): word at offset 0 finishes the copy.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
/* Sizes ≡ 7 (mod 8): dword at offset 3, word at offset 1, byte at 0.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx		/* bytes 3..6 */
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx		/* bytes 1..2 */
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax		/* byte 0; %eax reused, reloaded below */
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	/* 3 = 2 bytes at offset 1 plus the low byte.  */
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END


	/* Jump tables: 32-bit offsets relative to the table start (see
	   JMPTBL/BRANCH_TO_JMPTBL_ENTRY), indexed by remaining byte
	   count.  Kept in a named read-only section so the PIC and
	   non-PIC builds share the layout.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	/* Same dispatch, for the destination-aligned forward tails.  */
	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	/* Dispatch on (src - dst) & 15 for the palignr-based shifted
	   copy loops.  */
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	/* Dispatch table for the backward tail copies.  */
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
	/* Backward copier for overlapping memmove (dst > src).  On
	   entry EAX = src, EDX = dst, ECX = len.  Works down from the
	   end: EDX/EDI point one past the last byte still to write.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.
	 */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Under 32 bytes remain.  Step both pointers back to the start
	   of the unwritten tail (EAX = src tail, EDX = dst tail),
	   restore EDI, and dispatch on the count.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* Re-establish the CFI state (EDI still saved on the stack) for
	   the code below, which is only reached while EDI is pushed.  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	/* Align the destination end (EDX) to 4 bytes by copying the
	   trailing odd byte and/or word first.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* EDX is aligned to 4 bytes, but not 16.  Copy one dword at a
	   time (at most three) until the destination end is 16-byte
	   aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	/* Main backward loop: 64 bytes per iteration, unaligned loads
	   (movdqu) and aligned stores (movdqa — EDX is 16-byte aligned
	   here), copying the highest chunk first.  */
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)