;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------
; Sum-of-absolute-differences (SAD) kernels built on psadbw.
;
; Syntax: Intel operand order (NASM/YASM style).  Arguments are reached
; through the arg(n) / SHADOW_ARGS_TO_STACK helpers from the include
; below, so the same source serves every ABI those helpers support.
;
; NOTE(review): several routines write xmm6/xmm7 without saving them.
;   Those registers are callee-saved under the Microsoft x64 ABI --
;   confirm whether the ABI support include offers a save/restore macro.
; NOTE(review): the MMX-based routines do not execute emms; presumably
;   the caller resets the x87/MMX state -- confirm against callers.
;-----------------------------------------------------------------------

%include "vpx_ports/x86_abi_support.asm"

; QWORD expands to nothing, so "QWORD PTR [mem]" operands take their
; size from the register operand of the instruction.
%idefine QWORD

;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;   Accumulate the SAD of two 16-byte source rows against the reference
;   at byte offsets +0, +1 and +2.
; In:    rsi = source row (loaded with movdqa, so it must be 16-byte
;              aligned), rax = source stride
;        rdi = reference row (loaded with lddqu), rdx = reference stride
;        %1  = non-zero on the first call: accumulators are initialised
;              instead of added to
; Out:   xmm5 / xmm6 / xmm7 = running SAD for offsets +0 / +1 / +2
;        (one partial sum per 64-bit half)
;        rsi, rdi advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       [rsi]
        lddqu           xmm5,       [rdi]
        lddqu           xmm6,       [rdi+1]
        lddqu           xmm7,       [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       [rsi]
        lddqu           xmm1,       [rdi]
        lddqu           xmm2,       [rdi+1]
        lddqu           xmm3,       [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       QWORD PTR [rsi+rax]
        lddqu           xmm1,       QWORD PTR [rdi+rdx]
        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 first
;   8-byte-wide (MMX) counterpart of PROCESS_16X2X3.
; In:    rsi/rax = source row/stride, rdi/rdx = reference row/stride
;        %1 = non-zero on the first call (initialise accumulators)
; Out:   mm5 / mm6 / mm7 = running SAD for reference offsets +0/+1/+2
;        rsi, rdi advanced by two rows
; Clobb: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 1
%if %1
        movq            mm0,        [rsi]
        movq            mm5,        [rdi]
        movq            mm6,        [rdi+1]
        movq            mm7,        [rdi+2]

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        [rsi]
        movq            mm1,        [rdi]
        movq            mm2,        [rdi+1]
        movq            mm3,        [rdi+2]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endif
        ; second row of the pair
        movq            mm0,        QWORD PTR [rsi+rax]
        movq            mm1,        QWORD PTR [rdi+rdx]
        movq            mm2,        QWORD PTR [rdi+rdx+1]
        movq            mm3,        QWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endmacro

;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
;   Load four consecutive register-sized values from [base] into
;   dst0..dst3.  dst3 is written last, so it may be the same register
;   as base (every use in this file passes rdi for both).
;   REG_SZ_BYTES is not defined in this file; presumably it comes from
;   the ABI support include.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1+REG_SZ_BYTES*0]
        mov             %3,         [%1+REG_SZ_BYTES*1]

        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X4 first
;   Accumulate the SAD of two 16-byte source rows against four
;   independent reference blocks.
; In:    rsi = source row (movdqa: 16-byte aligned), rax = source stride
;        rcx, rdx, rbx, rdi = the four reference rows
;        rbp = reference stride (shared by all four references)
;        %1  = non-zero on the first call (initialise accumulators)
; Out:   xmm4 / xmm5 / xmm6 / xmm7 = running SAD for the references in
;        rcx / rdx / rbx / rdi
;        all five pointers advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 1
%if %1
        movdqa          xmm0,       [rsi]
        lddqu           xmm4,       [rcx]
        lddqu           xmm5,       [rdx]
        lddqu           xmm6,       [rbx]
        lddqu           xmm7,       [rdi]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       [rsi]
        lddqu           xmm1,       [rcx]
        lddqu           xmm2,       [rdx]
        lddqu           xmm3,       [rbx]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       [rdi]       ; xmm1 reused for the 4th reference
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif
        ; second row of the pair
        movdqa          xmm0,       QWORD PTR [rsi+rax]
        lddqu           xmm1,       QWORD PTR [rcx+rbp]
        lddqu           xmm2,       QWORD PTR [rdx+rbp]
        lddqu           xmm3,       QWORD PTR [rbx+rbp]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       QWORD PTR [rdi+rbp]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]

        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]

        lea             rdi,        [rdi+rbp*2]

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X4 first
;   8-byte-wide (MMX) counterpart of PROCESS_16X2X4.
; In:    rsi/rax = source row/stride
;        rcx, rdx, rbx, rdi = reference rows, rbp = reference stride
;        %1 = non-zero on the first call (initialise accumulators)
; Out:   mm4 / mm5 / mm6 / mm7 = running SAD for rcx / rdx / rbx / rdi
;        all five pointers advanced by two rows
; Clobb: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 1
%if %1
        movq            mm0,        [rsi]
        movq            mm4,        [rcx]
        movq            mm5,        [rdx]
        movq            mm6,        [rbx]
        movq            mm7,        [rdi]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        [rsi]
        movq            mm1,        [rcx]
        movq            mm2,        [rdx]
        movq            mm3,        [rbx]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        [rdi]       ; mm1 reused for the 4th reference
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif
        ; second row of the pair
        movq            mm0,        QWORD PTR [rsi+rax]
        movq            mm1,        QWORD PTR [rcx+rbp]
        movq            mm2,        QWORD PTR [rdx+rbp]
        movq            mm3,        QWORD PTR [rbx+rbp]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [rdi+rbp]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]

        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]

        lea             rdi,        [rdi+rbp*2]

        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%endmacro

;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit values: results[k] is the SAD of the 16x16 source
; block against the reference block starting at ref_ptr + k (k = 0..2).
; Eight two-row steps cover the 16 rows.
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        mov             rdi,        arg(4) ;Results

        ; fold the high-half partial sum onto the low half, store 32 bits
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; As vp8_sad16x16x3_sse3, but for 8 rows (four two-row steps).
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        mov             rdi,        arg(4) ;Results

        ; fold the high-half partial sum onto the low half, store 32 bits
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit values: results[k] is the SAD of the 8x16 source
; block against the reference block starting at ref_ptr + k (k = 0..2).
; Uses MMX registers mm0-mm3 and mm5-mm7.
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_8X2X3 1
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0

        mov             rdi,        arg(4) ;Results

        movd            [rdi],      mm5
        movd            [rdi+4],    mm6
        movd            [rdi+8],    mm7

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; As vp8_sad8x16x3_sse3, but for 8 rows (four two-row steps).
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_8X2X3 1
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0

        mov             rdi,        arg(4) ;Results

        movd            [rdi],      mm5
        movd            [rdi+4],    mm6
        movd            [rdi+8],    mm7

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit values: results[k] is the SAD of the 4x4 source
; block against the reference block starting at ref_ptr + k (k = 0..2).
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte register
; so a single psadbw covers both rows; source and reference are
; interleaved the same way, so corresponding bytes stay paired.
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rows 0 and 1
        movd            mm0,        QWORD PTR [rsi]
        movd            mm1,        QWORD PTR [rdi]

        movd            mm2,        QWORD PTR [rsi+rax]
        movd            mm3,        QWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        QWORD PTR [rdi+1]
        movd            mm5,        QWORD PTR [rdi+2]

        movd            mm2,        QWORD PTR [rdi+rdx+1]
        movd            mm3,        QWORD PTR [rdi+rdx+2]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0



        ; rows 2 and 3
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movd            mm0,        QWORD PTR [rsi]
        movd            mm2,        QWORD PTR [rdi]

        movd            mm3,        QWORD PTR [rsi+rax]
        movd            mm6,        QWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm6

        movd            mm3,        QWORD PTR [rdi+1]
        movd            mm7,        QWORD PTR [rdi+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2

        movd            mm2,        QWORD PTR [rdi+rdx+1]
        movd            mm6,        QWORD PTR [rdi+rdx+2]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm6

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        mov             rdi,        arg(4) ;Results
        movd            [rdi],      mm1

        movd            [rdi+4],    mm3
        movd            [rdi+8],    mm7


        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Returns the SAD of a 16x16 block.  Before each pair of rows the
; running total is compared (signed) with max_err; once it is greater,
; the routine returns the partial total accumulated so far.
; Registers: rsi/rbx = source row/stride, rdi/rdx = reference row/stride,
;            rcx = source address 16 rows past the start (loop end),
;            mm7 = running total.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rbx
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]

        lea             rcx,        [rcx+rbx*8]     ; rcx = src_ptr + 16*src_stride
        pxor            mm7,        mm7

vp8_sad16x16_sse3_loop:

        movd            rax,        mm7
        ; max_err is a 32-bit int: only the low dword of its argument
        ; slot is defined, so compare 32 bits (the stride arguments are
        ; likewise read as dwords above).
        cmp             eax,        arg(4)
        jg              vp8_sad16x16_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2
        paddw           mm4,        mm1

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             vp8_sad16x16_sse3_loop

        movd            rax,        mm7             ; return value

vp8_sad16x16_early_exit:

        ; begin epilog
        pop             rdi
        pop             rsi
        pop             rbx
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base is read as a table of four reference pointers.  Writes
; four 32-bit values: results[k] is the SAD of the 16x16 source block
; against the block at the k-th reference pointer.
; rbp is reused as the reference stride, so the frame pointer is pushed
; first and every arg() read happens before it is overwritten or after
; it is popped.
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        push            rbx
        ; end prolog

        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        xchg            rbx,        rax     ; rax = src_stride, rbx = third reference

        PROCESS_16X2X4 1
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0

        pop             rbp
        mov             rdi,        arg(4) ;Results

        ; fold the high-half partial sum onto the low half, store 32 bits
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+8],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+12],   xmm0

        ; begin epilog
        pop             rbx
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; As vp8_sad16x16x4d_sse3, but for 8 rows (four two-row steps).
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        push            rbx
        ; end prolog

        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        xchg            rbx,        rax     ; rax = src_stride, rbx = third reference

        PROCESS_16X2X4 1
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0

        pop             rbp
        mov             rdi,        arg(4) ;Results

        ; fold the high-half partial sum onto the low half, store 32 bits
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+8],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+12],   xmm0

        ; begin epilog
        pop             rbx
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base is read as a table of four reference pointers.  Writes
; four 32-bit values: results[k] is the SAD of the 8x16 source block
; against the block at the k-th reference pointer.  Uses MMX registers.
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        push            rbx
        ; end prolog

        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        xchg            rbx,        rax     ; rax = src_stride, rbx = third reference

        PROCESS_8X2X4 1
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0

        pop             rbp
        mov             rdi,        arg(4) ;Results

        movd            [rdi],      mm4
        movd            [rdi+4],    mm5
        movd            [rdi+8],    mm6
        movd            [rdi+12],   mm7

        ; begin epilog
        pop             rbx
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; As vp8_sad8x16x4d_sse3, but for 8 rows (four two-row steps).
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        push            rbx
        ; end prolog

        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        xchg            rbx,        rax     ; rax = src_stride, rbx = third reference

        PROCESS_8X2X4 1
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0

        pop             rbp
        mov             rdi,        arg(4) ;Results

        movd            [rdi],      mm4
        movd            [rdi+4],    mm5
        movd            [rdi+8],    mm6
        movd            [rdi+12],   mm7

        ; begin epilog
        pop             rbx
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base is read as a table of four reference pointers.  Writes
; four 32-bit values: results[k] is the SAD of the 4x4 source block
; against the block at the k-th reference pointer.
; Two 4-byte rows are interleaved (punpcklbw) so one psadbw covers both.
; rbp holds the reference stride until the last source row has been
; read; the stride then moves to rax so rbp can be restored for the
; arg(4) read.
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        push            rbx
        ; end prolog

        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        xchg            rbx,        rax     ; rax = src_stride, rbx = third reference

        ; rows 0 and 1
        movd            mm0,        QWORD PTR [rsi]
        movd            mm1,        QWORD PTR [rcx]

        movd            mm2,        QWORD PTR [rsi+rax]
        movd            mm3,        QWORD PTR [rcx+rbp]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        QWORD PTR [rdx]
        movd            mm5,        QWORD PTR [rbx]

        movd            mm6,        QWORD PTR [rdi]
        movd            mm2,        QWORD PTR [rdx+rbp]

        movd            mm3,        QWORD PTR [rbx+rbp]
        movd            mm7,        QWORD PTR [rdi+rbp]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        punpcklbw       mm6,        mm7
        psadbw          mm4,        mm0

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0



        ; rows 2 and 3
        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]

        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]

        lea             rdi,        [rdi+rbp*2]

        movd            mm0,        QWORD PTR [rsi]
        movd            mm2,        QWORD PTR [rcx]

        movd            mm3,        QWORD PTR [rsi+rax]
        movd            mm7,        QWORD PTR [rcx+rbp]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm7

        movd            mm3,        QWORD PTR [rdx]
        movd            mm7,        QWORD PTR [rbx]

        psadbw          mm2,        mm0
        mov             rax,        rbp     ; rax = ref_stride from here on

        pop             rbp
        mov             rsi,        arg(4) ;Results

        paddw           mm1,        mm2
        movd            [rsi],      mm1

        movd            mm2,        QWORD PTR [rdx+rax]
        movd            mm1,        QWORD PTR [rbx+rax]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm1

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        QWORD PTR [rdi]
        movd            mm1,        QWORD PTR [rdi+rax]

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        movd            [rsi+4],    mm3
        punpcklbw       mm2,        mm1

        movd            [rsi+8],    mm7
        psadbw          mm2,        mm0

        paddw           mm2,        mm6
        movd            [rsi+12],   mm2


        ; begin epilog
        pop             rbx
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret