/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Hand-tuned SSSE3 memcpy for 32-bit x86, AT&T syntax, run through the C
   preprocessor.  The same source builds memcpy, bcopy (USE_AS_BCOPY),
   mempcpy (USE_AS_MEMPCPY) and memmove (USE_AS_MEMMOVE) depending on which
   of those macros the including build defines.  */

/* Name of the exported entry point; overridable by the build.  */
#ifndef MEMCPY
# define MEMCPY		ssse3_memcpy5
#endif

/* Assembler-local label helper: L(foo) expands to .Lfoo, which the
   assembler keeps out of the symbol table.  */
#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* CFI wrappers so this file also assembles where the glibc sysdep
   macros are not already provided.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore (reg)
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Function prologue/epilogue bracketing: symbol type, visibility,
   16-byte alignment, and CFI open/close plus .size.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Stack offsets of the three arguments relative to %esp on entry.
   bcopy(src, dst, n) swaps the first two arguments relative to
   memcpy(dst, src, n).  PARMS (defined below) accounts for whether
   EBX was pushed by ENTRANCE.  */
#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif

/* push/pop paired with the matching CFI bookkeeping so unwind info
   stays correct across register saves.  */
#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);	\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);	\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#ifdef SHARED
/* PIC build: EBX is the GOT register and is callee-saved, so every entry
   pushes it, which moves the argument block 8 bytes up the stack.
   RETURN re-pushes the CFI state because several RETURNs share one
   function body.  */
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* Jump-table entries are stored as offsets relative to the table so the
   .rodata stays position-independent (no dynamic relocations).  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register contains the
   index into the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx,INDEX,SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp		*%ebx

/* Split form of the above: _VALUE computes the table address early
   (hiding the thunk-call latency), _TAIL indexes and branches later.
   EBX must not be clobbered in between.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
    addl	$(TABLE - .), %ebx

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
    addl	(%ebx,INDEX,SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp		*%ebx

/* PC thunk: returns its own return address in EBX.  Lives in a
   link-once section so multiple objects share a single copy.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
/* Non-PIC build: no EBX juggling; jump tables hold absolute addresses.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register contains the index into the
   jump table.  SCALE is the scale of INDEX.
*/
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    jmp	*TABLE(,INDEX,SCALE)

/* No-op in the non-PIC build: the _TAIL form below needs no
   precomputed base.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
    jmp	*TABLE(,INDEX,SCALE)
#endif

	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx		/* ECX = byte count */
	movl	SRC(%esp), %eax		/* EAX = source */
	movl	DEST(%esp), %edx	/* EDX = destination */

#ifdef USE_AS_MEMMOVE
/* memmove: choose a copy direction that is safe when the buffers
   overlap.  */
	cmp	%eax, %edx
	jb	L(copy_forward)		/* dst < src: forward copy is safe */
	je	L(fwd_write_0bytes)	/* dst == src: nothing to copy */
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)
L(memmove_bwd):
	add	%ecx, %eax		/* EAX = one past end of source */
	cmp	%eax, %edx
	/* Reload src between cmp and jb: mov does not alter flags.  */
	movl	SRC(%esp), %eax
	jb	L(copy_backward)	/* dst inside [src, src+len): go backward */

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)	/* large copy: take the SSE path */

L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* NOTE(review): compares only the low bytes of src and dst —
	   presumably a cheap direction heuristic for this short-copy
	   path; confirm against the backward table's semantics.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	/* Point EAX/EDX one past the end, then dispatch on the length
	   to a tail stub that copies exactly ECX bytes backward from
	   those end pointers.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif

	ALIGN (4)
/* ECX > 32 and EDX is 4 byte aligned.
*/ 195 L(48bytesormore): 196 movdqu (%eax), %xmm0 197 PUSH (%edi) 198 movl %edx, %edi 199 and $-16, %edx 200 PUSH (%esi) 201 add $16, %edx 202 movl %edi, %esi 203 sub %edx, %edi 204 add %edi, %ecx 205 sub %edi, %eax 206 207 #ifdef SHARED_CACHE_SIZE_HALF 208 cmp $SHARED_CACHE_SIZE_HALF, %ecx 209 #else 210 # ifdef SHARED 211 call __i686.get_pc_thunk.bx 212 add $_GLOBAL_OFFSET_TABLE_, %ebx 213 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx 214 # else 215 cmp __x86_shared_cache_size_half, %ecx 216 # endif 217 #endif 218 219 mov %eax, %edi 220 jae L(large_page) 221 and $0xf, %edi 222 jz L(shl_0) 223 224 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) 225 226 ALIGN (4) 227 L(shl_0): 228 movdqu %xmm0, (%esi) 229 xor %edi, %edi 230 POP (%esi) 231 cmp $127, %ecx 232 ja L(shl_0_gobble) 233 lea -32(%ecx), %ecx 234 L(shl_0_loop): 235 movdqa (%eax, %edi), %xmm0 236 movdqa 16(%eax, %edi), %xmm1 237 sub $32, %ecx 238 movdqa %xmm0, (%edx, %edi) 239 movdqa %xmm1, 16(%edx, %edi) 240 lea 32(%edi), %edi 241 jb L(shl_0_end) 242 243 movdqa (%eax, %edi), %xmm0 244 movdqa 16(%eax, %edi), %xmm1 245 sub $32, %ecx 246 movdqa %xmm0, (%edx, %edi) 247 movdqa %xmm1, 16(%edx, %edi) 248 lea 32(%edi), %edi 249 jb L(shl_0_end) 250 251 movdqa (%eax, %edi), %xmm0 252 movdqa 16(%eax, %edi), %xmm1 253 sub $32, %ecx 254 movdqa %xmm0, (%edx, %edi) 255 movdqa %xmm1, 16(%edx, %edi) 256 lea 32(%edi), %edi 257 jb L(shl_0_end) 258 259 movdqa (%eax, %edi), %xmm0 260 movdqa 16(%eax, %edi), %xmm1 261 sub $32, %ecx 262 movdqa %xmm0, (%edx, %edi) 263 movdqa %xmm1, 16(%edx, %edi) 264 lea 32(%edi), %edi 265 L(shl_0_end): 266 lea 32(%ecx), %ecx 267 add %ecx, %edi 268 add %edi, %edx 269 add %edi, %eax 270 POP (%edi) 271 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 272 273 L(shl_0_gobble): 274 275 #ifdef DATA_CACHE_SIZE_HALF 276 cmp $DATA_CACHE_SIZE_HALF, %ecx 277 #else 278 # ifdef SHARED 279 call __i686.get_pc_thunk.bx 280 add $_GLOBAL_OFFSET_TABLE_, %ebx 281 cmp __x86_data_cache_size_half@GOTOFF(%ebx), 
%ecx 282 # else 283 cmp __x86_data_cache_size_half, %ecx 284 # endif 285 #endif 286 287 POP (%edi) 288 lea -128(%ecx), %ecx 289 jae L(shl_0_gobble_mem_loop) 290 L(shl_0_gobble_cache_loop): 291 movdqa (%eax), %xmm0 292 movdqa 0x10(%eax), %xmm1 293 movdqa 0x20(%eax), %xmm2 294 movdqa 0x30(%eax), %xmm3 295 movdqa 0x40(%eax), %xmm4 296 movdqa 0x50(%eax), %xmm5 297 movdqa 0x60(%eax), %xmm6 298 movdqa 0x70(%eax), %xmm7 299 lea 0x80(%eax), %eax 300 sub $128, %ecx 301 movdqa %xmm0, (%edx) 302 movdqa %xmm1, 0x10(%edx) 303 movdqa %xmm2, 0x20(%edx) 304 movdqa %xmm3, 0x30(%edx) 305 movdqa %xmm4, 0x40(%edx) 306 movdqa %xmm5, 0x50(%edx) 307 movdqa %xmm6, 0x60(%edx) 308 movdqa %xmm7, 0x70(%edx) 309 lea 0x80(%edx), %edx 310 311 jae L(shl_0_gobble_cache_loop) 312 cmp $-0x40, %ecx 313 lea 0x80(%ecx), %ecx 314 jl L(shl_0_cache_less_64bytes) 315 316 movdqa (%eax), %xmm0 317 sub $0x40, %ecx 318 movdqa 0x10(%eax), %xmm1 319 320 movdqa %xmm0, (%edx) 321 movdqa %xmm1, 0x10(%edx) 322 323 movdqa 0x20(%eax), %xmm0 324 movdqa 0x30(%eax), %xmm1 325 add $0x40, %eax 326 327 movdqa %xmm0, 0x20(%edx) 328 movdqa %xmm1, 0x30(%edx) 329 add $0x40, %edx 330 L(shl_0_cache_less_64bytes): 331 cmp $0x20, %ecx 332 jb L(shl_0_cache_less_32bytes) 333 movdqa (%eax), %xmm0 334 sub $0x20, %ecx 335 movdqa 0x10(%eax), %xmm1 336 add $0x20, %eax 337 movdqa %xmm0, (%edx) 338 movdqa %xmm1, 0x10(%edx) 339 add $0x20, %edx 340 L(shl_0_cache_less_32bytes): 341 cmp $0x10, %ecx 342 jb L(shl_0_cache_less_16bytes) 343 sub $0x10, %ecx 344 movdqa (%eax), %xmm0 345 add $0x10, %eax 346 movdqa %xmm0, (%edx) 347 add $0x10, %edx 348 L(shl_0_cache_less_16bytes): 349 add %ecx, %edx 350 add %ecx, %eax 351 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 352 353 354 ALIGN (4) 355 L(shl_0_gobble_mem_loop): 356 prefetcht0 0x1c0(%eax) 357 prefetcht0 0x280(%eax) 358 prefetcht0 0x1c0(%edx) 359 360 movdqa (%eax), %xmm0 361 movdqa 0x10(%eax), %xmm1 362 movdqa 0x20(%eax), %xmm2 363 movdqa 0x30(%eax), %xmm3 364 movdqa 0x40(%eax), %xmm4 365 
movdqa 0x50(%eax), %xmm5 366 movdqa 0x60(%eax), %xmm6 367 movdqa 0x70(%eax), %xmm7 368 lea 0x80(%eax), %eax 369 sub $0x80, %ecx 370 movdqa %xmm0, (%edx) 371 movdqa %xmm1, 0x10(%edx) 372 movdqa %xmm2, 0x20(%edx) 373 movdqa %xmm3, 0x30(%edx) 374 movdqa %xmm4, 0x40(%edx) 375 movdqa %xmm5, 0x50(%edx) 376 movdqa %xmm6, 0x60(%edx) 377 movdqa %xmm7, 0x70(%edx) 378 lea 0x80(%edx), %edx 379 380 jae L(shl_0_gobble_mem_loop) 381 cmp $-0x40, %ecx 382 lea 0x80(%ecx), %ecx 383 jl L(shl_0_mem_less_64bytes) 384 385 movdqa (%eax), %xmm0 386 sub $0x40, %ecx 387 movdqa 0x10(%eax), %xmm1 388 389 movdqa %xmm0, (%edx) 390 movdqa %xmm1, 0x10(%edx) 391 392 movdqa 0x20(%eax), %xmm0 393 movdqa 0x30(%eax), %xmm1 394 add $0x40, %eax 395 396 movdqa %xmm0, 0x20(%edx) 397 movdqa %xmm1, 0x30(%edx) 398 add $0x40, %edx 399 L(shl_0_mem_less_64bytes): 400 cmp $0x20, %ecx 401 jb L(shl_0_mem_less_32bytes) 402 movdqa (%eax), %xmm0 403 sub $0x20, %ecx 404 movdqa 0x10(%eax), %xmm1 405 add $0x20, %eax 406 movdqa %xmm0, (%edx) 407 movdqa %xmm1, 0x10(%edx) 408 add $0x20, %edx 409 L(shl_0_mem_less_32bytes): 410 cmp $0x10, %ecx 411 jb L(shl_0_mem_less_16bytes) 412 sub $0x10, %ecx 413 movdqa (%eax), %xmm0 414 add $0x10, %eax 415 movdqa %xmm0, (%edx) 416 add $0x10, %edx 417 L(shl_0_mem_less_16bytes): 418 add %ecx, %edx 419 add %ecx, %eax 420 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 421 422 423 ALIGN (4) 424 L(shl_1): 425 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 426 lea -1(%eax), %eax 427 movaps (%eax), %xmm1 428 xor %edi, %edi 429 lea -32(%ecx), %ecx 430 movdqu %xmm0, (%esi) 431 POP (%esi) 432 L(shl_1_loop): 433 434 movdqa 16(%eax, %edi), %xmm2 435 sub $32, %ecx 436 movdqa 32(%eax, %edi), %xmm3 437 movdqa %xmm3, %xmm4 438 palignr $1, %xmm2, %xmm3 439 palignr $1, %xmm1, %xmm2 440 lea 32(%edi), %edi 441 movdqa %xmm2, -32(%edx, %edi) 442 movdqa %xmm3, -16(%edx, %edi) 443 444 jb L(shl_1_end) 445 446 movdqa 16(%eax, %edi), %xmm2 447 sub $32, %ecx 448 movdqa 32(%eax, %edi), %xmm3 449 movdqa 
%xmm3, %xmm1 450 palignr $1, %xmm2, %xmm3 451 palignr $1, %xmm4, %xmm2 452 lea 32(%edi), %edi 453 movdqa %xmm2, -32(%edx, %edi) 454 movdqa %xmm3, -16(%edx, %edi) 455 456 jae L(shl_1_loop) 457 458 L(shl_1_end): 459 lea 32(%ecx), %ecx 460 add %ecx, %edi 461 add %edi, %edx 462 lea 1(%edi, %eax), %eax 463 POP (%edi) 464 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 465 466 ALIGN (4) 467 L(shl_2): 468 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 469 lea -2(%eax), %eax 470 movaps (%eax), %xmm1 471 xor %edi, %edi 472 lea -32(%ecx), %ecx 473 movdqu %xmm0, (%esi) 474 POP (%esi) 475 L(shl_2_loop): 476 477 movdqa 16(%eax, %edi), %xmm2 478 sub $32, %ecx 479 movdqa 32(%eax, %edi), %xmm3 480 movdqa %xmm3, %xmm4 481 palignr $2, %xmm2, %xmm3 482 palignr $2, %xmm1, %xmm2 483 lea 32(%edi), %edi 484 movdqa %xmm2, -32(%edx, %edi) 485 movdqa %xmm3, -16(%edx, %edi) 486 487 jb L(shl_2_end) 488 489 movdqa 16(%eax, %edi), %xmm2 490 sub $32, %ecx 491 movdqa 32(%eax, %edi), %xmm3 492 movdqa %xmm3, %xmm1 493 palignr $2, %xmm2, %xmm3 494 palignr $2, %xmm4, %xmm2 495 lea 32(%edi), %edi 496 movdqa %xmm2, -32(%edx, %edi) 497 movdqa %xmm3, -16(%edx, %edi) 498 499 jae L(shl_2_loop) 500 501 L(shl_2_end): 502 lea 32(%ecx), %ecx 503 add %ecx, %edi 504 add %edi, %edx 505 lea 2(%edi, %eax), %eax 506 POP (%edi) 507 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 508 509 ALIGN (4) 510 L(shl_3): 511 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 512 lea -3(%eax), %eax 513 movaps (%eax), %xmm1 514 xor %edi, %edi 515 lea -32(%ecx), %ecx 516 movdqu %xmm0, (%esi) 517 POP (%esi) 518 L(shl_3_loop): 519 520 movdqa 16(%eax, %edi), %xmm2 521 sub $32, %ecx 522 movdqa 32(%eax, %edi), %xmm3 523 movdqa %xmm3, %xmm4 524 palignr $3, %xmm2, %xmm3 525 palignr $3, %xmm1, %xmm2 526 lea 32(%edi), %edi 527 movdqa %xmm2, -32(%edx, %edi) 528 movdqa %xmm3, -16(%edx, %edi) 529 530 jb L(shl_3_end) 531 532 movdqa 16(%eax, %edi), %xmm2 533 sub $32, %ecx 534 movdqa 32(%eax, %edi), %xmm3 535 movdqa 
%xmm3, %xmm1 536 palignr $3, %xmm2, %xmm3 537 palignr $3, %xmm4, %xmm2 538 lea 32(%edi), %edi 539 movdqa %xmm2, -32(%edx, %edi) 540 movdqa %xmm3, -16(%edx, %edi) 541 542 jae L(shl_3_loop) 543 544 L(shl_3_end): 545 lea 32(%ecx), %ecx 546 add %ecx, %edi 547 add %edi, %edx 548 lea 3(%edi, %eax), %eax 549 POP (%edi) 550 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 551 552 ALIGN (4) 553 L(shl_4): 554 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 555 lea -4(%eax), %eax 556 movaps (%eax), %xmm1 557 xor %edi, %edi 558 lea -32(%ecx), %ecx 559 movdqu %xmm0, (%esi) 560 POP (%esi) 561 L(shl_4_loop): 562 563 movdqa 16(%eax, %edi), %xmm2 564 sub $32, %ecx 565 movdqa 32(%eax, %edi), %xmm3 566 movdqa %xmm3, %xmm4 567 palignr $4, %xmm2, %xmm3 568 palignr $4, %xmm1, %xmm2 569 lea 32(%edi), %edi 570 movdqa %xmm2, -32(%edx, %edi) 571 movdqa %xmm3, -16(%edx, %edi) 572 573 jb L(shl_4_end) 574 575 movdqa 16(%eax, %edi), %xmm2 576 sub $32, %ecx 577 movdqa 32(%eax, %edi), %xmm3 578 movdqa %xmm3, %xmm1 579 palignr $4, %xmm2, %xmm3 580 palignr $4, %xmm4, %xmm2 581 lea 32(%edi), %edi 582 movdqa %xmm2, -32(%edx, %edi) 583 movdqa %xmm3, -16(%edx, %edi) 584 585 jae L(shl_4_loop) 586 587 L(shl_4_end): 588 lea 32(%ecx), %ecx 589 add %ecx, %edi 590 add %edi, %edx 591 lea 4(%edi, %eax), %eax 592 POP (%edi) 593 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 594 595 ALIGN (4) 596 L(shl_5): 597 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 598 lea -5(%eax), %eax 599 movaps (%eax), %xmm1 600 xor %edi, %edi 601 lea -32(%ecx), %ecx 602 movdqu %xmm0, (%esi) 603 POP (%esi) 604 L(shl_5_loop): 605 606 movdqa 16(%eax, %edi), %xmm2 607 sub $32, %ecx 608 movdqa 32(%eax, %edi), %xmm3 609 movdqa %xmm3, %xmm4 610 palignr $5, %xmm2, %xmm3 611 palignr $5, %xmm1, %xmm2 612 lea 32(%edi), %edi 613 movdqa %xmm2, -32(%edx, %edi) 614 movdqa %xmm3, -16(%edx, %edi) 615 616 jb L(shl_5_end) 617 618 movdqa 16(%eax, %edi), %xmm2 619 sub $32, %ecx 620 movdqa 32(%eax, %edi), %xmm3 621 movdqa 
%xmm3, %xmm1 622 palignr $5, %xmm2, %xmm3 623 palignr $5, %xmm4, %xmm2 624 lea 32(%edi), %edi 625 movdqa %xmm2, -32(%edx, %edi) 626 movdqa %xmm3, -16(%edx, %edi) 627 628 jae L(shl_5_loop) 629 630 L(shl_5_end): 631 lea 32(%ecx), %ecx 632 add %ecx, %edi 633 add %edi, %edx 634 lea 5(%edi, %eax), %eax 635 POP (%edi) 636 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 637 638 639 ALIGN (4) 640 L(shl_6): 641 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 642 lea -6(%eax), %eax 643 movaps (%eax), %xmm1 644 xor %edi, %edi 645 lea -32(%ecx), %ecx 646 movdqu %xmm0, (%esi) 647 POP (%esi) 648 L(shl_6_loop): 649 650 movdqa 16(%eax, %edi), %xmm2 651 sub $32, %ecx 652 movdqa 32(%eax, %edi), %xmm3 653 movdqa %xmm3, %xmm4 654 palignr $6, %xmm2, %xmm3 655 palignr $6, %xmm1, %xmm2 656 lea 32(%edi), %edi 657 movdqa %xmm2, -32(%edx, %edi) 658 movdqa %xmm3, -16(%edx, %edi) 659 660 jb L(shl_6_end) 661 662 movdqa 16(%eax, %edi), %xmm2 663 sub $32, %ecx 664 movdqa 32(%eax, %edi), %xmm3 665 movdqa %xmm3, %xmm1 666 palignr $6, %xmm2, %xmm3 667 palignr $6, %xmm4, %xmm2 668 lea 32(%edi), %edi 669 movdqa %xmm2, -32(%edx, %edi) 670 movdqa %xmm3, -16(%edx, %edi) 671 672 jae L(shl_6_loop) 673 674 L(shl_6_end): 675 lea 32(%ecx), %ecx 676 add %ecx, %edi 677 add %edi, %edx 678 lea 6(%edi, %eax), %eax 679 POP (%edi) 680 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 681 682 ALIGN (4) 683 L(shl_7): 684 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 685 lea -7(%eax), %eax 686 movaps (%eax), %xmm1 687 xor %edi, %edi 688 lea -32(%ecx), %ecx 689 movdqu %xmm0, (%esi) 690 POP (%esi) 691 L(shl_7_loop): 692 693 movdqa 16(%eax, %edi), %xmm2 694 sub $32, %ecx 695 movdqa 32(%eax, %edi), %xmm3 696 movdqa %xmm3, %xmm4 697 palignr $7, %xmm2, %xmm3 698 palignr $7, %xmm1, %xmm2 699 lea 32(%edi), %edi 700 movdqa %xmm2, -32(%edx, %edi) 701 movdqa %xmm3, -16(%edx, %edi) 702 703 jb L(shl_7_end) 704 705 movdqa 16(%eax, %edi), %xmm2 706 sub $32, %ecx 707 movdqa 32(%eax, %edi), %xmm3 708 movdqa 
%xmm3, %xmm1 709 palignr $7, %xmm2, %xmm3 710 palignr $7, %xmm4, %xmm2 711 lea 32(%edi), %edi 712 movdqa %xmm2, -32(%edx, %edi) 713 movdqa %xmm3, -16(%edx, %edi) 714 715 jae L(shl_7_loop) 716 717 L(shl_7_end): 718 lea 32(%ecx), %ecx 719 add %ecx, %edi 720 add %edi, %edx 721 lea 7(%edi, %eax), %eax 722 POP (%edi) 723 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 724 725 ALIGN (4) 726 L(shl_8): 727 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 728 lea -8(%eax), %eax 729 movaps (%eax), %xmm1 730 xor %edi, %edi 731 lea -32(%ecx), %ecx 732 movdqu %xmm0, (%esi) 733 POP (%esi) 734 L(shl_8_loop): 735 736 movdqa 16(%eax, %edi), %xmm2 737 sub $32, %ecx 738 movdqa 32(%eax, %edi), %xmm3 739 movdqa %xmm3, %xmm4 740 palignr $8, %xmm2, %xmm3 741 palignr $8, %xmm1, %xmm2 742 lea 32(%edi), %edi 743 movdqa %xmm2, -32(%edx, %edi) 744 movdqa %xmm3, -16(%edx, %edi) 745 746 jb L(shl_8_end) 747 748 movdqa 16(%eax, %edi), %xmm2 749 sub $32, %ecx 750 movdqa 32(%eax, %edi), %xmm3 751 movdqa %xmm3, %xmm1 752 palignr $8, %xmm2, %xmm3 753 palignr $8, %xmm4, %xmm2 754 lea 32(%edi), %edi 755 movdqa %xmm2, -32(%edx, %edi) 756 movdqa %xmm3, -16(%edx, %edi) 757 758 jae L(shl_8_loop) 759 760 L(shl_8_end): 761 lea 32(%ecx), %ecx 762 add %ecx, %edi 763 add %edi, %edx 764 lea 8(%edi, %eax), %eax 765 POP (%edi) 766 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 767 768 ALIGN (4) 769 L(shl_9): 770 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 771 lea -9(%eax), %eax 772 movaps (%eax), %xmm1 773 xor %edi, %edi 774 lea -32(%ecx), %ecx 775 movdqu %xmm0, (%esi) 776 POP (%esi) 777 L(shl_9_loop): 778 779 movdqa 16(%eax, %edi), %xmm2 780 sub $32, %ecx 781 movdqa 32(%eax, %edi), %xmm3 782 movdqa %xmm3, %xmm4 783 palignr $9, %xmm2, %xmm3 784 palignr $9, %xmm1, %xmm2 785 lea 32(%edi), %edi 786 movdqa %xmm2, -32(%edx, %edi) 787 movdqa %xmm3, -16(%edx, %edi) 788 789 jb L(shl_9_end) 790 791 movdqa 16(%eax, %edi), %xmm2 792 sub $32, %ecx 793 movdqa 32(%eax, %edi), %xmm3 794 movdqa 
%xmm3, %xmm1 795 palignr $9, %xmm2, %xmm3 796 palignr $9, %xmm4, %xmm2 797 lea 32(%edi), %edi 798 movdqa %xmm2, -32(%edx, %edi) 799 movdqa %xmm3, -16(%edx, %edi) 800 801 jae L(shl_9_loop) 802 803 L(shl_9_end): 804 lea 32(%ecx), %ecx 805 add %ecx, %edi 806 add %edi, %edx 807 lea 9(%edi, %eax), %eax 808 POP (%edi) 809 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 810 811 ALIGN (4) 812 L(shl_10): 813 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 814 lea -10(%eax), %eax 815 movaps (%eax), %xmm1 816 xor %edi, %edi 817 lea -32(%ecx), %ecx 818 movdqu %xmm0, (%esi) 819 POP (%esi) 820 L(shl_10_loop): 821 822 movdqa 16(%eax, %edi), %xmm2 823 sub $32, %ecx 824 movdqa 32(%eax, %edi), %xmm3 825 movdqa %xmm3, %xmm4 826 palignr $10, %xmm2, %xmm3 827 palignr $10, %xmm1, %xmm2 828 lea 32(%edi), %edi 829 movdqa %xmm2, -32(%edx, %edi) 830 movdqa %xmm3, -16(%edx, %edi) 831 832 jb L(shl_10_end) 833 834 movdqa 16(%eax, %edi), %xmm2 835 sub $32, %ecx 836 movdqa 32(%eax, %edi), %xmm3 837 movdqa %xmm3, %xmm1 838 palignr $10, %xmm2, %xmm3 839 palignr $10, %xmm4, %xmm2 840 lea 32(%edi), %edi 841 movdqa %xmm2, -32(%edx, %edi) 842 movdqa %xmm3, -16(%edx, %edi) 843 844 jae L(shl_10_loop) 845 846 L(shl_10_end): 847 lea 32(%ecx), %ecx 848 add %ecx, %edi 849 add %edi, %edx 850 lea 10(%edi, %eax), %eax 851 POP (%edi) 852 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 853 854 ALIGN (4) 855 L(shl_11): 856 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 857 lea -11(%eax), %eax 858 movaps (%eax), %xmm1 859 xor %edi, %edi 860 lea -32(%ecx), %ecx 861 movdqu %xmm0, (%esi) 862 POP (%esi) 863 L(shl_11_loop): 864 865 movdqa 16(%eax, %edi), %xmm2 866 sub $32, %ecx 867 movdqa 32(%eax, %edi), %xmm3 868 movdqa %xmm3, %xmm4 869 palignr $11, %xmm2, %xmm3 870 palignr $11, %xmm1, %xmm2 871 lea 32(%edi), %edi 872 movdqa %xmm2, -32(%edx, %edi) 873 movdqa %xmm3, -16(%edx, %edi) 874 875 jb L(shl_11_end) 876 877 movdqa 16(%eax, %edi), %xmm2 878 sub $32, %ecx 879 movdqa 32(%eax, %edi), 
%xmm3 880 movdqa %xmm3, %xmm1 881 palignr $11, %xmm2, %xmm3 882 palignr $11, %xmm4, %xmm2 883 lea 32(%edi), %edi 884 movdqa %xmm2, -32(%edx, %edi) 885 movdqa %xmm3, -16(%edx, %edi) 886 887 jae L(shl_11_loop) 888 889 L(shl_11_end): 890 lea 32(%ecx), %ecx 891 add %ecx, %edi 892 add %edi, %edx 893 lea 11(%edi, %eax), %eax 894 POP (%edi) 895 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 896 897 ALIGN (4) 898 L(shl_12): 899 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 900 lea -12(%eax), %eax 901 movaps (%eax), %xmm1 902 xor %edi, %edi 903 lea -32(%ecx), %ecx 904 movdqu %xmm0, (%esi) 905 POP (%esi) 906 L(shl_12_loop): 907 908 movdqa 16(%eax, %edi), %xmm2 909 sub $32, %ecx 910 movdqa 32(%eax, %edi), %xmm3 911 movdqa %xmm3, %xmm4 912 palignr $12, %xmm2, %xmm3 913 palignr $12, %xmm1, %xmm2 914 lea 32(%edi), %edi 915 movdqa %xmm2, -32(%edx, %edi) 916 movdqa %xmm3, -16(%edx, %edi) 917 918 jb L(shl_12_end) 919 920 movdqa 16(%eax, %edi), %xmm2 921 sub $32, %ecx 922 movdqa 32(%eax, %edi), %xmm3 923 movdqa %xmm3, %xmm1 924 palignr $12, %xmm2, %xmm3 925 palignr $12, %xmm4, %xmm2 926 lea 32(%edi), %edi 927 movdqa %xmm2, -32(%edx, %edi) 928 movdqa %xmm3, -16(%edx, %edi) 929 930 jae L(shl_12_loop) 931 932 L(shl_12_end): 933 lea 32(%ecx), %ecx 934 add %ecx, %edi 935 add %edi, %edx 936 lea 12(%edi, %eax), %eax 937 POP (%edi) 938 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 939 940 ALIGN (4) 941 L(shl_13): 942 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 943 lea -13(%eax), %eax 944 movaps (%eax), %xmm1 945 xor %edi, %edi 946 lea -32(%ecx), %ecx 947 movdqu %xmm0, (%esi) 948 POP (%esi) 949 L(shl_13_loop): 950 951 movdqa 16(%eax, %edi), %xmm2 952 sub $32, %ecx 953 movdqa 32(%eax, %edi), %xmm3 954 movdqa %xmm3, %xmm4 955 palignr $13, %xmm2, %xmm3 956 palignr $13, %xmm1, %xmm2 957 lea 32(%edi), %edi 958 movdqa %xmm2, -32(%edx, %edi) 959 movdqa %xmm3, -16(%edx, %edi) 960 961 jb L(shl_13_end) 962 963 movdqa 16(%eax, %edi), %xmm2 964 sub $32, %ecx 965 
movdqa 32(%eax, %edi), %xmm3 966 movdqa %xmm3, %xmm1 967 palignr $13, %xmm2, %xmm3 968 palignr $13, %xmm4, %xmm2 969 lea 32(%edi), %edi 970 movdqa %xmm2, -32(%edx, %edi) 971 movdqa %xmm3, -16(%edx, %edi) 972 973 jae L(shl_13_loop) 974 975 L(shl_13_end): 976 lea 32(%ecx), %ecx 977 add %ecx, %edi 978 add %edi, %edx 979 lea 13(%edi, %eax), %eax 980 POP (%edi) 981 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 982 983 ALIGN (4) 984 L(shl_14): 985 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 986 lea -14(%eax), %eax 987 movaps (%eax), %xmm1 988 xor %edi, %edi 989 lea -32(%ecx), %ecx 990 movdqu %xmm0, (%esi) 991 POP (%esi) 992 L(shl_14_loop): 993 994 movdqa 16(%eax, %edi), %xmm2 995 sub $32, %ecx 996 movdqa 32(%eax, %edi), %xmm3 997 movdqa %xmm3, %xmm4 998 palignr $14, %xmm2, %xmm3 999 palignr $14, %xmm1, %xmm2 1000 lea 32(%edi), %edi 1001 movdqa %xmm2, -32(%edx, %edi) 1002 movdqa %xmm3, -16(%edx, %edi) 1003 1004 jb L(shl_14_end) 1005 1006 movdqa 16(%eax, %edi), %xmm2 1007 sub $32, %ecx 1008 movdqa 32(%eax, %edi), %xmm3 1009 movdqa %xmm3, %xmm1 1010 palignr $14, %xmm2, %xmm3 1011 palignr $14, %xmm4, %xmm2 1012 lea 32(%edi), %edi 1013 movdqa %xmm2, -32(%edx, %edi) 1014 movdqa %xmm3, -16(%edx, %edi) 1015 1016 jae L(shl_14_loop) 1017 1018 L(shl_14_end): 1019 lea 32(%ecx), %ecx 1020 add %ecx, %edi 1021 add %edi, %edx 1022 lea 14(%edi, %eax), %eax 1023 POP (%edi) 1024 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1025 1026 1027 ALIGN (4) 1028 L(shl_15): 1029 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 1030 lea -15(%eax), %eax 1031 movaps (%eax), %xmm1 1032 xor %edi, %edi 1033 lea -32(%ecx), %ecx 1034 movdqu %xmm0, (%esi) 1035 POP (%esi) 1036 L(shl_15_loop): 1037 1038 movdqa 16(%eax, %edi), %xmm2 1039 sub $32, %ecx 1040 movdqa 32(%eax, %edi), %xmm3 1041 movdqa %xmm3, %xmm4 1042 palignr $15, %xmm2, %xmm3 1043 palignr $15, %xmm1, %xmm2 1044 lea 32(%edi), %edi 1045 movdqa %xmm2, -32(%edx, %edi) 1046 movdqa %xmm3, -16(%edx, %edi) 1047 1048 jb 
L(shl_15_end) 1049 1050 movdqa 16(%eax, %edi), %xmm2 1051 sub $32, %ecx 1052 movdqa 32(%eax, %edi), %xmm3 1053 movdqa %xmm3, %xmm1 1054 palignr $15, %xmm2, %xmm3 1055 palignr $15, %xmm4, %xmm2 1056 lea 32(%edi), %edi 1057 movdqa %xmm2, -32(%edx, %edi) 1058 movdqa %xmm3, -16(%edx, %edi) 1059 1060 jae L(shl_15_loop) 1061 1062 L(shl_15_end): 1063 lea 32(%ecx), %ecx 1064 add %ecx, %edi 1065 add %edi, %edx 1066 lea 15(%edi, %eax), %eax 1067 POP (%edi) 1068 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1069 1070 1071 ALIGN (4) 1072 L(fwd_write_44bytes): 1073 movl -44(%eax), %ecx 1074 movl %ecx, -44(%edx) 1075 L(fwd_write_40bytes): 1076 movl -40(%eax), %ecx 1077 movl %ecx, -40(%edx) 1078 L(fwd_write_36bytes): 1079 movl -36(%eax), %ecx 1080 movl %ecx, -36(%edx) 1081 L(fwd_write_32bytes): 1082 movl -32(%eax), %ecx 1083 movl %ecx, -32(%edx) 1084 L(fwd_write_28bytes): 1085 movl -28(%eax), %ecx 1086 movl %ecx, -28(%edx) 1087 L(fwd_write_24bytes): 1088 movl -24(%eax), %ecx 1089 movl %ecx, -24(%edx) 1090 L(fwd_write_20bytes): 1091 movl -20(%eax), %ecx 1092 movl %ecx, -20(%edx) 1093 L(fwd_write_16bytes): 1094 movl -16(%eax), %ecx 1095 movl %ecx, -16(%edx) 1096 L(fwd_write_12bytes): 1097 movl -12(%eax), %ecx 1098 movl %ecx, -12(%edx) 1099 L(fwd_write_8bytes): 1100 movl -8(%eax), %ecx 1101 movl %ecx, -8(%edx) 1102 L(fwd_write_4bytes): 1103 movl -4(%eax), %ecx 1104 movl %ecx, -4(%edx) 1105 L(fwd_write_0bytes): 1106 #ifndef USE_AS_BCOPY 1107 # ifdef USE_AS_MEMPCPY 1108 movl %edx, %eax 1109 # else 1110 movl DEST(%esp), %eax 1111 # endif 1112 #endif 1113 RETURN 1114 1115 ALIGN (4) 1116 L(fwd_write_5bytes): 1117 movl -5(%eax), %ecx 1118 movl -4(%eax), %eax 1119 movl %ecx, -5(%edx) 1120 movl %eax, -4(%edx) 1121 #ifndef USE_AS_BCOPY 1122 # ifdef USE_AS_MEMPCPY 1123 movl %edx, %eax 1124 # else 1125 movl DEST(%esp), %eax 1126 # endif 1127 #endif 1128 RETURN 1129 1130 ALIGN (4) 1131 L(fwd_write_45bytes): 1132 movl -45(%eax), %ecx 1133 movl %ecx, -45(%edx) 1134 
L(fwd_write_41bytes): 1135 movl -41(%eax), %ecx 1136 movl %ecx, -41(%edx) 1137 L(fwd_write_37bytes): 1138 movl -37(%eax), %ecx 1139 movl %ecx, -37(%edx) 1140 L(fwd_write_33bytes): 1141 movl -33(%eax), %ecx 1142 movl %ecx, -33(%edx) 1143 L(fwd_write_29bytes): 1144 movl -29(%eax), %ecx 1145 movl %ecx, -29(%edx) 1146 L(fwd_write_25bytes): 1147 movl -25(%eax), %ecx 1148 movl %ecx, -25(%edx) 1149 L(fwd_write_21bytes): 1150 movl -21(%eax), %ecx 1151 movl %ecx, -21(%edx) 1152 L(fwd_write_17bytes): 1153 movl -17(%eax), %ecx 1154 movl %ecx, -17(%edx) 1155 L(fwd_write_13bytes): 1156 movl -13(%eax), %ecx 1157 movl %ecx, -13(%edx) 1158 L(fwd_write_9bytes): 1159 movl -9(%eax), %ecx 1160 movl %ecx, -9(%edx) 1161 movl -5(%eax), %ecx 1162 movl %ecx, -5(%edx) 1163 L(fwd_write_1bytes): 1164 movzbl -1(%eax), %ecx 1165 movb %cl, -1(%edx) 1166 #ifndef USE_AS_BCOPY 1167 # ifdef USE_AS_MEMPCPY 1168 movl %edx, %eax 1169 # else 1170 movl DEST(%esp), %eax 1171 # endif 1172 #endif 1173 RETURN 1174 1175 ALIGN (4) 1176 L(fwd_write_46bytes): 1177 movl -46(%eax), %ecx 1178 movl %ecx, -46(%edx) 1179 L(fwd_write_42bytes): 1180 movl -42(%eax), %ecx 1181 movl %ecx, -42(%edx) 1182 L(fwd_write_38bytes): 1183 movl -38(%eax), %ecx 1184 movl %ecx, -38(%edx) 1185 L(fwd_write_34bytes): 1186 movl -34(%eax), %ecx 1187 movl %ecx, -34(%edx) 1188 L(fwd_write_30bytes): 1189 movl -30(%eax), %ecx 1190 movl %ecx, -30(%edx) 1191 L(fwd_write_26bytes): 1192 movl -26(%eax), %ecx 1193 movl %ecx, -26(%edx) 1194 L(fwd_write_22bytes): 1195 movl -22(%eax), %ecx 1196 movl %ecx, -22(%edx) 1197 L(fwd_write_18bytes): 1198 movl -18(%eax), %ecx 1199 movl %ecx, -18(%edx) 1200 L(fwd_write_14bytes): 1201 movl -14(%eax), %ecx 1202 movl %ecx, -14(%edx) 1203 L(fwd_write_10bytes): 1204 movl -10(%eax), %ecx 1205 movl %ecx, -10(%edx) 1206 L(fwd_write_6bytes): 1207 movl -6(%eax), %ecx 1208 movl %ecx, -6(%edx) 1209 L(fwd_write_2bytes): 1210 movzwl -2(%eax), %ecx 1211 movw %cx, -2(%edx) 1212 #ifndef USE_AS_BCOPY 1213 # ifdef USE_AS_MEMPCPY 
1214 movl %edx, %eax 1215 # else 1216 movl DEST(%esp), %eax 1217 # endif 1218 #endif 1219 RETURN 1220 1221 ALIGN (4) 1222 L(fwd_write_47bytes): 1223 movl -47(%eax), %ecx 1224 movl %ecx, -47(%edx) 1225 L(fwd_write_43bytes): 1226 movl -43(%eax), %ecx 1227 movl %ecx, -43(%edx) 1228 L(fwd_write_39bytes): 1229 movl -39(%eax), %ecx 1230 movl %ecx, -39(%edx) 1231 L(fwd_write_35bytes): 1232 movl -35(%eax), %ecx 1233 movl %ecx, -35(%edx) 1234 L(fwd_write_31bytes): 1235 movl -31(%eax), %ecx 1236 movl %ecx, -31(%edx) 1237 L(fwd_write_27bytes): 1238 movl -27(%eax), %ecx 1239 movl %ecx, -27(%edx) 1240 L(fwd_write_23bytes): 1241 movl -23(%eax), %ecx 1242 movl %ecx, -23(%edx) 1243 L(fwd_write_19bytes): 1244 movl -19(%eax), %ecx 1245 movl %ecx, -19(%edx) 1246 L(fwd_write_15bytes): 1247 movl -15(%eax), %ecx 1248 movl %ecx, -15(%edx) 1249 L(fwd_write_11bytes): 1250 movl -11(%eax), %ecx 1251 movl %ecx, -11(%edx) 1252 L(fwd_write_7bytes): 1253 movl -7(%eax), %ecx 1254 movl %ecx, -7(%edx) 1255 L(fwd_write_3bytes): 1256 movzwl -3(%eax), %ecx 1257 movzbl -1(%eax), %eax 1258 movw %cx, -3(%edx) 1259 movb %al, -1(%edx) 1260 #ifndef USE_AS_BCOPY 1261 # ifdef USE_AS_MEMPCPY 1262 movl %edx, %eax 1263 # else 1264 movl DEST(%esp), %eax 1265 # endif 1266 #endif 1267 RETURN 1268 1269 ALIGN (4) 1270 L(large_page): 1271 movdqu (%eax), %xmm1 1272 lea 16(%eax), %eax 1273 movdqu %xmm0, (%esi) 1274 movntdq %xmm1, (%edx) 1275 lea 16(%edx), %edx 1276 POP (%esi) 1277 lea -0x90(%ecx), %ecx 1278 POP (%edi) 1279 L(large_page_loop): 1280 movdqu (%eax), %xmm0 1281 movdqu 0x10(%eax), %xmm1 1282 movdqu 0x20(%eax), %xmm2 1283 movdqu 0x30(%eax), %xmm3 1284 movdqu 0x40(%eax), %xmm4 1285 movdqu 0x50(%eax), %xmm5 1286 movdqu 0x60(%eax), %xmm6 1287 movdqu 0x70(%eax), %xmm7 1288 lea 0x80(%eax), %eax 1289 1290 sub $0x80, %ecx 1291 movntdq %xmm0, (%edx) 1292 movntdq %xmm1, 0x10(%edx) 1293 movntdq %xmm2, 0x20(%edx) 1294 movntdq %xmm3, 0x30(%edx) 1295 movntdq %xmm4, 0x40(%edx) 1296 movntdq %xmm5, 0x50(%edx) 1297 movntdq 
%xmm6, 0x60(%edx) 1298 movntdq %xmm7, 0x70(%edx) 1299 lea 0x80(%edx), %edx 1300 jae L(large_page_loop) 1301 cmp $-0x40, %ecx 1302 lea 0x80(%ecx), %ecx 1303 jl L(large_page_less_64bytes) 1304 1305 movdqu (%eax), %xmm0 1306 movdqu 0x10(%eax), %xmm1 1307 movdqu 0x20(%eax), %xmm2 1308 movdqu 0x30(%eax), %xmm3 1309 lea 0x40(%eax), %eax 1310 1311 movntdq %xmm0, (%edx) 1312 movntdq %xmm1, 0x10(%edx) 1313 movntdq %xmm2, 0x20(%edx) 1314 movntdq %xmm3, 0x30(%edx) 1315 lea 0x40(%edx), %edx 1316 sub $0x40, %ecx 1317 L(large_page_less_64bytes): 1318 cmp $32, %ecx 1319 jb L(large_page_less_32bytes) 1320 movdqu (%eax), %xmm0 1321 movdqu 0x10(%eax), %xmm1 1322 lea 0x20(%eax), %eax 1323 movntdq %xmm0, (%edx) 1324 movntdq %xmm1, 0x10(%edx) 1325 lea 0x20(%edx), %edx 1326 sub $0x20, %ecx 1327 L(large_page_less_32bytes): 1328 add %ecx, %edx 1329 add %ecx, %eax 1330 sfence 1331 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 1332 1333 1334 ALIGN (4) 1335 L(bk_write_44bytes): 1336 movl 40(%eax), %ecx 1337 movl %ecx, 40(%edx) 1338 L(bk_write_40bytes): 1339 movl 36(%eax), %ecx 1340 movl %ecx, 36(%edx) 1341 L(bk_write_36bytes): 1342 movl 32(%eax), %ecx 1343 movl %ecx, 32(%edx) 1344 L(bk_write_32bytes): 1345 movl 28(%eax), %ecx 1346 movl %ecx, 28(%edx) 1347 L(bk_write_28bytes): 1348 movl 24(%eax), %ecx 1349 movl %ecx, 24(%edx) 1350 L(bk_write_24bytes): 1351 movl 20(%eax), %ecx 1352 movl %ecx, 20(%edx) 1353 L(bk_write_20bytes): 1354 movl 16(%eax), %ecx 1355 movl %ecx, 16(%edx) 1356 L(bk_write_16bytes): 1357 movl 12(%eax), %ecx 1358 movl %ecx, 12(%edx) 1359 L(bk_write_12bytes): 1360 movl 8(%eax), %ecx 1361 movl %ecx, 8(%edx) 1362 L(bk_write_8bytes): 1363 movl 4(%eax), %ecx 1364 movl %ecx, 4(%edx) 1365 L(bk_write_4bytes): 1366 movl (%eax), %ecx 1367 movl %ecx, (%edx) 1368 L(bk_write_0bytes): 1369 #ifndef USE_AS_BCOPY 1370 movl DEST(%esp), %eax 1371 # ifdef USE_AS_MEMPCPY 1372 movl LEN(%esp), %ecx 1373 add %ecx, %eax 1374 # endif 1375 #endif 1376 RETURN 1377 1378 ALIGN (4) 1379 
/* Backward tail copies for lengths 45, 41, ..., 5, 1 (len % 4 == 1
   residues).  EAX = src, EDX = dst; 4 bytes per fall-through step,
   finishing with the single odd byte at offset 0.  */
L(bk_write_45bytes):
	movl	41(%eax), %ecx
	movl	%ecx, 41(%edx)
L(bk_write_41bytes):
	movl	37(%eax), %ecx
	movl	%ecx, 37(%edx)
L(bk_write_37bytes):
	movl	33(%eax), %ecx
	movl	%ecx, 33(%edx)
L(bk_write_33bytes):
	movl	29(%eax), %ecx
	movl	%ecx, 29(%edx)
L(bk_write_29bytes):
	movl	25(%eax), %ecx
	movl	%ecx, 25(%edx)
L(bk_write_25bytes):
	movl	21(%eax), %ecx
	movl	%ecx, 21(%edx)
L(bk_write_21bytes):
	movl	17(%eax), %ecx
	movl	%ecx, 17(%edx)
L(bk_write_17bytes):
	movl	13(%eax), %ecx
	movl	%ecx, 13(%edx)
L(bk_write_13bytes):
	movl	9(%eax), %ecx
	movl	%ecx, 9(%edx)
L(bk_write_9bytes):
	movl	5(%eax), %ecx
	movl	%ecx, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
/* Epilogue: memmove returns original dst; mempcpy returns dst + len.  */
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	ALIGN (4)
/* Backward tail copies for lengths 46, 42, ..., 6, 2 (len % 4 == 2
   residues); the last two bytes are copied as one 16-bit word.  */
L(bk_write_46bytes):
	movl	42(%eax), %ecx
	movl	%ecx, 42(%edx)
L(bk_write_42bytes):
	movl	38(%eax), %ecx
	movl	%ecx, 38(%edx)
L(bk_write_38bytes):
	movl	34(%eax), %ecx
	movl	%ecx, 34(%edx)
L(bk_write_34bytes):
	movl	30(%eax), %ecx
	movl	%ecx, 30(%edx)
L(bk_write_30bytes):
	movl	26(%eax), %ecx
	movl	%ecx, 26(%edx)
L(bk_write_26bytes):
	movl	22(%eax), %ecx
	movl	%ecx, 22(%edx)
L(bk_write_22bytes):
	movl	18(%eax), %ecx
	movl	%ecx, 18(%edx)
L(bk_write_18bytes):
	movl	14(%eax), %ecx
	movl	%ecx, 14(%edx)
L(bk_write_14bytes):
	movl	10(%eax), %ecx
	movl	%ecx, 10(%edx)
L(bk_write_10bytes):
	movl	6(%eax), %ecx
	movl	%ecx, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
/* Epilogue: memmove returns original dst; mempcpy returns dst + len.  */
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	ALIGN (4)
/* Backward tail copies for lengths 47, 43, ..., 7, 3 (len % 4 == 3
   residues); the final 3 bytes are a 2-byte word at offset 1 plus the
   single byte at offset 0.  Ends with RETURN_END: this is the last
   code path, so the EBX CFI state need not be re-pushed.  */
L(bk_write_47bytes):
	movl	43(%eax), %ecx
	movl	%ecx, 43(%edx)
L(bk_write_43bytes):
	movl	39(%eax), %ecx
	movl	%ecx, 39(%edx)
L(bk_write_39bytes):
	movl	35(%eax), %ecx
	movl	%ecx, 35(%edx)
L(bk_write_35bytes):
	movl	31(%eax), %ecx
	movl	%ecx, 31(%edx)
L(bk_write_31bytes):
	movl	27(%eax), %ecx
	movl	%ecx, 27(%edx)
L(bk_write_27bytes):
	movl	23(%eax), %ecx
	movl	%ecx, 23(%edx)
L(bk_write_23bytes):
	movl	19(%eax), %ecx
	movl	%ecx, 19(%edx)
L(bk_write_19bytes):
	movl	15(%eax), %ecx
	movl	%ecx, 15(%edx)
L(bk_write_15bytes):
	movl	11(%eax), %ecx
	movl	%ecx, 11(%edx)
L(bk_write_11bytes):
	movl	7(%eax), %ecx
	movl	%ecx, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END


/* Jump tables live in .rodata; entries are table-relative offsets
   (see JMPTBL/BRANCH_TO_JMPTBL_ENTRY in the file header), indexed by
   the residual byte count, 4 bytes per entry.  */
	.pushsection .rodata.ssse3,"a",@progbits
	ALIGN (2)
/* Forward tail dispatch: entry N jumps to the code that copies the
   last N bytes (0 <= N < 48) given EAX = src + N, EDX = dst + N.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	ALIGN (2)
/* Dispatch on src/dst misalignment difference: entry N handles the
   case where the source is offset N bytes (0 <= N < 16) from a 16-byte
   boundary relative to the destination (palignr shift count).  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	ALIGN (2)
/* Backward tail dispatch: entry N jumps to the code that copies the
   first N remaining bytes (0 <= N < 48) given EAX = src, EDX = dst.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
/* memmove backward copy (overlapping, dst > src): copy from the end
   of both buffers toward the start.  On entry EAX = src, EDX = dst,
   ECX = len; ESI is saved and used as the walking source pointer,
   while EDX walks the destination.  Both are advanced to one past the
   end of their buffers.  */
	ALIGN (4)
L(copy_backward):
	PUSH (%esi)
	movl	%eax, %esi
	lea	(%ecx,%edx,1),%edx	/* EDX = dst + len (end of dst).  */
	lea	(%ecx,%esi,1),%esi	/* ESI = src + len (end of src).  */
	testl	$0x3, %edx
	jnz	L(bk_align)		/* Align the dst end to 4 bytes first.  */

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movl	-4(%esi), %eax
	movl	%eax, -4(%edx)
	movl	-8(%esi), %eax
	movl	%eax, -8(%edx)
	movl	-12(%esi), %eax
	movl	%eax, -12(%edx)
	movl	-16(%esi), %eax
	movl	%eax, -16(%edx)
	movl	-20(%esi), %eax
	movl	%eax, -20(%edx)
	movl	-24(%esi), %eax
	movl	%eax, -24(%edx)
	movl	-28(%esi), %eax
	movl	%eax, -28(%edx)
	movl	-32(%esi), %eax
	movl	%eax, -32(%edx)
	sub	$32, %edx
	sub	$32, %esi

L(bk_write_less32bytes):
	/* Fewer than 32 bytes left: rewind EAX/EDX to the start of the
	   remaining region (the bk_write tables use positive offsets),
	   restore ESI, and dispatch through the backward jump table.  */
	movl	%esi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%esi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	ALIGN (4)
/* Align the destination end pointer down to a 4-byte boundary by
   copying 1 and/or 2 trailing bytes.  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)	/* Too short to bother aligning.  */
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %esi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%esi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %esi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%esi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	ALIGN (4)
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes: copy 4-byte words (at most
   three) until the destination end is 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

/* Bulk backward loop: 64 bytes per iteration, highest chunk first so
   overlapping regions are handled correctly.  Unaligned loads, aligned
   (movdqa) stores — EDX is 16-byte aligned here.  */
L(bk_ssse3_cpy):
	sub	$64, %esi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%esi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%esi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%esi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%esi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)