;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Sum-of-absolute-differences (SAD) kernels built on psadbw.
;
; Syntax: NASM/YASM (Intel operand order).
; ABIs:   32-bit cdecl-style stack args, Win64 (output format "x64") and
;         every other 64-bit output format (register args rdi, rsi, rdx,
;         rcx, r8, i.e. System V AMD64).
;
;   *x3  functions: three SADs, against ref_ptr, ref_ptr+1 and ref_ptr+2.
;   *x4d functions: four SADs, against the four pointers stored in the array
;                   that the third argument points to.
;
; The 8-wide and 4-wide kernels use MMX registers and do not execute emms;
; nothing in this file clears the MMX state before returning.
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
; Loads four consecutive pointer-sized values from [base] into dst0..dst3.
; dst3 may alias base because it is written last.
;-----------------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
    mov         %2,         [%1+REG_SZ_BYTES*0]
    mov         %3,         [%1+REG_SZ_BYTES*1]

    mov         %4,         [%1+REG_SZ_BYTES*2]
    mov         %5,         [%1+REG_SZ_BYTES*3]
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
; Prologue for the five-argument kernels (src_ptr, src_stride, ref_ptr,
; ref_stride, results/max_err).  Binds the symbolic names src_ptr,
; src_stride, ref_ptr, ref_stride, end_ptr, ret_var, result_ptr and max_err
; to registers / stack slots for the active ABI.
;
; Win64: xmm6 and xmm7 are callee-saved but are used as accumulators by the
; kernels, so they are spilled into the 32-byte home (shadow) space the
; caller reserves above the return address.  rsp is not changed, so the
; fifth argument stays at [rsp+8+4*8].
;
; 64-bit ABIs: the strides are C ints; the upper halves of their argument
; registers are not guaranteed, so they are sign-extended before being used
; as 64-bit index registers (the 32-bit path already does this via movsxd).
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+8+4*8]         ; return address + home space
    %define     max_err     [rsp+8+4*8]

    movdqu      [rsp+8],    xmm6                ; home space, bytes  0..15
    movdqu      [rsp+8+16], xmm7                ; home space, bytes 16..31

    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8

    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
; Epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic names,
; restores whatever the prologue saved and returns.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6,       [rsp+8]
    movdqu      xmm7,       [rsp+8+16]
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
; Prologue for the x4d kernels.  Binds src_ptr, src_stride, r0_ptr..r3_ptr,
; ref_stride and result_ptr, and loads the four reference pointers from the
; array passed as the third argument.
;
; 32-bit: rbp is reused as ref_stride, so the frame pointer is pushed a second
; time; each kernel pops it again (under %if ABI_IS_32BIT) before it reads
; result_ptr, which is addressed through arg(4).
;
; Win64: rsi is callee-saved and pushed; xmm6/xmm7 are spilled into the
; caller's home space, which sits at [rsp+16] after that push.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; frame pointer, popped by the kernel
    mov         rdi,        arg(2)              ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi   ; r2 parked in rax for now

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride

    xchg        rbx,        rax                 ; rax = src_stride, rbx = r2_ptr
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+16+4*8]        ; rsi + return address + home space
    push        rsi

    movdqu      [rsp+16],    xmm6               ; home space, bytes  0..15
    movdqu      [rsp+16+16], xmm7               ; home space, bytes 16..31

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
; Epilogue matching STACK_FRAME_CREATE_X4.  On 32-bit the kernel must already
; have popped the extra rbp pushed by the prologue.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6,       [rsp+16]
    movdqu      xmm7,       [rsp+16+16]
    pop         rsi
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------------
; STORE_SAD_XMM acc, offset
; Adds the two 64-bit psadbw lanes of xmm register `acc` and stores the low
; 32 bits of the sum at [rcx+offset].  Clobbers xmm0 and `acc`.
;-----------------------------------------------------------------------------
%macro STORE_SAD_XMM 2
        movq            xmm0,       %1
        psrldq          %1,         8

        paddw           xmm0,       %1
        movd            [rcx+%2],   xmm0
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X3 phase, src, ref, src_stride, ref_stride
; Two rows of 16 pixels against ref, ref+1 and ref+2.
;   phase 0: first pair  - initialises the accumulators xmm5/xmm6/xmm7
;   phase 1: middle pair - accumulates
;   phase 2: last pair   - accumulates, does not advance src/ref
; src rows are read with movdqa and therefore must be 16-byte aligned.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_8X2X3 phase, src, ref, src_stride, ref_stride
; MMX counterpart of PROCESS_16X2X3 for rows of 8 pixels.
; Accumulators mm5/mm6/mm7; clobbers mm0-mm3.
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq            mm0,       QWORD PTR [%2]
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        movq            mm0,       QWORD PTR [%2]
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X4 phase, src, r0, r1, r2, r3, src_stride, ref_stride
; Two rows of 16 pixels against four independent reference pointers.
; Phases as in PROCESS_16X2X3.  Accumulators xmm4..xmm7 (one per reference);
; clobbers xmm0-xmm3.  src rows are read with movdqa (16-byte aligned).
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X4 8
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm4,       XMMWORD PTR [%3]
        lddqu           xmm5,       XMMWORD PTR [%4]
        lddqu           xmm6,       XMMWORD PTR [%5]
        lddqu           xmm7,       XMMWORD PTR [%6]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%4]
        lddqu           xmm3,       XMMWORD PTR [%5]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%7]
        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        lddqu           xmm3,       XMMWORD PTR [%5+%8]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6+%8]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%endmacro

;-----------------------------------------------------------------------------
; PROCESS_8X2X4 phase, src, r0, r1, r2, r3, src_stride, ref_stride
; MMX counterpart of PROCESS_16X2X4 for rows of 8 pixels.
; Accumulators mm4..mm7; clobbers mm0-mm3.
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X4 8
%if %1==0
        movq            mm0,        QWORD PTR [%2]
        movq            mm4,        QWORD PTR [%3]
        movq            mm5,        QWORD PTR [%4]
        movq            mm6,        QWORD PTR [%5]
        movq            mm7,        QWORD PTR [%6]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        QWORD PTR [%2]
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%4]
        movq            mm3,        QWORD PTR [%5]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif
        movq            mm0,        QWORD PTR [%2+%7]
        movq            mm1,        QWORD PTR [%3+%8]
        movq            mm2,        QWORD PTR [%4+%8]
        movq            mm3,        QWORD PTR [%5+%8]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6+%8]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%endmacro

;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; results[k] = 16x16 SAD of src against ref_ptr+k, k = 0..2.
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr  ; src/ref registers are dead here

        STORE_SAD_XMM   xmm5,       0
        STORE_SAD_XMM   xmm6,       4
        STORE_SAD_XMM   xmm7,       8

    STACK_FRAME_DESTROY_X3

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; results[k] = 16x8 SAD of src against ref_ptr+k, k = 0..2.
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        STORE_SAD_XMM   xmm5,       0
        STORE_SAD_XMM   xmm6,       4
        STORE_SAD_XMM   xmm7,       8

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; results[k] = 8x16 SAD of src against ref_ptr+k, k = 0..2.  Uses MMX.
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        punpckldq       mm5,        mm6         ; mm5 = { sad0, sad1 }

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; results[k] = 8x8 SAD of src against ref_ptr+k, k = 0..2.  Uses MMX.
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        punpckldq       mm5,        mm6         ; mm5 = { sad0, sad1 }

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; results[k] = 4x4 SAD of src against ref_ptr+k, k = 0..2.  Uses MMX; two
; 4-pixel rows are interleaved with punpcklbw so one psadbw covers both.
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; rows 0-1
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; rows 2-3
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm6

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm6

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        mov             rcx,        result_ptr

        punpckldq       mm1,        mm3         ; mm1 = { sad0, sad1 }

        movq            [rcx],      mm1
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Returns the 16x16 SAD of src against ref in rax.  src rows are read with
; movdqa (16-byte aligned), ref rows with movdqu.  max_err is bound by the
; prologue but never read by this routine.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

        mov             end_ptr,    4           ; 4 iterations x 4 rows
        pxor            xmm7,        xmm7       ; running SAD

.vp8_sad16x16_sse3_loop:
        movdqa          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        movdqa          xmm4,       XMMWORD PTR [src_ptr]
        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

        psadbw          xmm0,       xmm1

        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]

        psadbw          xmm2,       xmm3
        psadbw          xmm4,       xmm5
        psadbw          xmm6,       xmm1

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        paddw           xmm7,        xmm0
        paddw           xmm7,        xmm2
        paddw           xmm7,        xmm4
        paddw           xmm7,        xmm6

        sub             end_ptr,     1
        jne             .vp8_sad16x16_sse3_loop

        ; fold the two 64-bit lanes and return the total
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to four reference pointers; results[k] = 16x16 SAD of
; src against the k-th of them.
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; frame pointer back, arg(4) usable again
%endif
        mov             rcx,        result_ptr

        STORE_SAD_XMM   xmm4,       0
        STORE_SAD_XMM   xmm5,       4
        STORE_SAD_XMM   xmm6,       8
        STORE_SAD_XMM   xmm7,       12

    STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to four reference pointers; results[k] = 16x8 SAD of
; src against the k-th of them.
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; frame pointer back, arg(4) usable again
%endif
        mov             rcx,        result_ptr

        STORE_SAD_XMM   xmm4,       0
        STORE_SAD_XMM   xmm5,       4
        STORE_SAD_XMM   xmm6,       8
        STORE_SAD_XMM   xmm7,       12

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to four reference pointers; results[k] = 8x16 SAD of
; src against the k-th of them.  Uses MMX.
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; frame pointer back, arg(4) usable again
%endif
        mov             rcx,        result_ptr

        punpckldq       mm4,        mm5         ; { sad0, sad1 }
        punpckldq       mm6,        mm7         ; { sad2, sad3 }

        movq            [rcx],      mm4
        movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to four reference pointers; results[k] = 8x8 SAD of
; src against the k-th of them.  Uses MMX.
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; frame pointer back, arg(4) usable again
%endif
        mov             rcx,        result_ptr

        punpckldq       mm4,        mm5         ; { sad0, sad1 }
        punpckldq       mm6,        mm7         ; { sad2, sad3 }

        movq            [rcx],      mm4
        movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to four reference pointers; results[k] = 4x4 SAD of
; src against the k-th of them.  Uses MMX; two 4-pixel rows are interleaved
; with punpcklbw so one psadbw covers both.
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; rows 0-1
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [r0_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        DWORD PTR [r1_ptr]
        movd            mm5,        DWORD PTR [r2_ptr]

        movd            mm6,        DWORD PTR [r3_ptr]
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        punpcklbw       mm6,        mm7
        psadbw          mm4,        mm0

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0



        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             r0_ptr,     [r0_ptr+ref_stride*2]

        lea             r1_ptr,     [r1_ptr+ref_stride*2]
        lea             r2_ptr,     [r2_ptr+ref_stride*2]

        lea             r3_ptr,     [r3_ptr+ref_stride*2]

        ; rows 2-3
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [r0_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm7

        movd            mm3,        DWORD PTR [r1_ptr]
        movd            mm7,        DWORD PTR [r2_ptr]

        psadbw          mm2,        mm0
%if ABI_IS_32BIT
        ; src_stride (rax) is dead: move ref_stride there so rbp can become
        ; the frame pointer again and result_ptr (arg(4)) can be read.
        mov             rax,        rbp

        pop             rbp
%define     ref_stride    rax
%endif
        ; rsi (src_ptr / src_stride / r0_ptr, depending on ABI) is dead here
        mov             rsi,        result_ptr

        paddw           mm1,        mm2
        movd            [rsi],      mm1

        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm1

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [r3_ptr]
        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        movd            [rsi+4],    mm3
        punpcklbw       mm2,        mm1

        movd            [rsi+8],    mm7
        psadbw          mm2,        mm0

        paddw           mm2,        mm6
        movd            [rsi+12],   mm2


    STACK_FRAME_DESTROY_X4