;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/yasm (Intel operand order).
; Argument access (arg(n)), symbol decoration (sym()), PIC data access
; (GET_GOT / GLOBAL / RESTORE_GOT) and the SHADOW_ARGS_TO_STACK / SAVE_XMM
; helpers are macros supplied by the include below.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_copy_mem16x16_sse2(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
;
; Copies 16 rows of 16 bytes from src to dst, fully unrolled.
; src is read with movdqu (any alignment); dst is written with movdqa,
; so dst and dst_stride must keep every row 16-byte aligned.
; Registers: rsi = src cursor, rax = src_stride,
;            rdi = dst cursor, rcx = dst_stride, xmm0-xmm5 = row data.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src;
    movdqu      xmm0,       [rsi]               ; row 0

    movsxd      rax,        dword ptr arg(1) ;src_stride;
    mov         rdi,        arg(2) ;dst;

    movdqu      xmm1,       [rsi+rax]           ; row 1
    movdqu      xmm2,       [rsi+rax*2]         ; row 2

    movsxd      rcx,        dword ptr arg(3) ;dst_stride
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm0                ; row 0
    add         rsi,        rax                 ; src cursor -> row 3

    movdqa      [rdi+rcx],  xmm1                ; row 1
    movdqa      [rdi+rcx*2],xmm2                ; row 2

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm3,       [rsi]               ; row 3

    add         rdi,        rcx                 ; dst cursor -> row 3
    movdqu      xmm4,       [rsi+rax]           ; row 4

    movdqu      xmm5,       [rsi+rax*2]         ; row 5
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm3                ; row 3
    add         rsi,        rax                 ; src cursor -> row 6

    movdqa      [rdi+rcx],  xmm4                ; row 4
    movdqa      [rdi+rcx*2],xmm5                ; row 5

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm0,       [rsi]               ; row 6

    add         rdi,        rcx                 ; dst cursor -> row 6
    movdqu      xmm1,       [rsi+rax]           ; row 7

    movdqu      xmm2,       [rsi+rax*2]         ; row 8
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm0                ; row 6
    add         rsi,        rax                 ; src cursor -> row 9

    movdqa      [rdi+rcx],  xmm1                ; row 7

    movdqa      [rdi+rcx*2],xmm2                ; row 8
    movdqu      xmm3,       [rsi]               ; row 9

    movdqu      xmm4,       [rsi+rax]           ; row 10
    lea         rdi,        [rdi+rcx*2]

    add         rdi,        rcx                 ; dst cursor -> row 9
    movdqu      xmm5,       [rsi+rax*2]         ; row 11

    lea         rsi,        [rsi+rax*2]
    movdqa      [rdi],      xmm3                ; row 9

    add         rsi,        rax                 ; src cursor -> row 12
    movdqa      [rdi+rcx],  xmm4                ; row 10

    movdqa      [rdi+rcx*2],xmm5                ; row 11
    movdqu      xmm0,       [rsi]               ; row 12

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm1,       [rsi+rax]           ; row 13

    add         rdi,        rcx                 ; dst cursor -> row 12
    movdqu      xmm2,       [rsi+rax*2]         ; row 14

    lea         rsi,        [rsi+rax*2]         ; src cursor -> row 14
    movdqa      [rdi],      xmm0                ; row 12

    movdqa      [rdi+rcx],  xmm1                ; row 13
    movdqa      [rdi+rcx*2],xmm2                ; row 14

    movdqu      xmm3,       [rsi+rax]           ; row 15
    lea         rdi,        [rdi+rcx*2]         ; dst cursor -> row 14

    movdqa      [rdi+rcx],  xmm3                ; row 15

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dc_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from both edges:
;   dc = (sum of 8 bytes at above + sum of 8 bytes down the left column + 8) / 16
; and every byte of the 8 output rows is set to dc.
; Uses MMX registers and does not execute emms itself.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above;
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        mm0,        mm0
    movq        mm1,        [rdi]
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    psadbw      mm1,        mm0                 ; mm1 word 0 = sum of the 8 above bytes
    ; from left
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax*1]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx

    movzx       edx,        byte [rsi+rdi]
    lea         rsi,        [rsi+rax*4]         ; advance to left rows 4..7
    add         ecx,        edx
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx                 ; ecx = sum of the 8 left bytes

    ; add up
    pextrw      edx,        mm1, 0x0
    lea         edx,        [edx+ecx+8]         ; + 8 rounds the divide by 16
    sar         edx,        4
    movd        mm1,        edx
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    pshufw      mm1,        mm1, 0x0            ; broadcast dc to 4 words
    mov         rdi,        arg(0) ;dst;
    packuswb    mm1,        mm1                 ; ... then to 8 bytes

    ; write out
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
    lea         rdx,        [rdi+rcx*4]         ; rdx = dst row 4

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    movq        [rdx      ], mm1
    movq        [rdx+rcx  ], mm1
    movq        [rdx+rcx*2], mm1
    movq        [rdx+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dctop_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from the top edge only:
;   dc = (sum of 8 bytes at above + 4) / 8
; left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top
    mov         rsi,        arg(2) ;above;
    pxor        mm0,        mm0
    movq        mm1,        [rsi]
    psadbw      mm1,        mm0                 ; mm1 word 0 = sum of the 8 above bytes

    ; add up
    paddw       mm1,        [GLOBAL(dc_4)]      ; rounding term
    psraw       mm1,        3
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi,        [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dcleft_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from the left edge only:
;   dc = (sum of 8 bytes down the left column + 4) / 8
; above is ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; advance to left rows 4..7
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    lea         edx,        [ecx+edx+4]         ; final add fused with the rounding term

    ; add up
    shr         edx,        3
    movd        mm1,        edx
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi,        [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dc128_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills the 8x8 block at dst with the constant byte 128 (from dc_128).
; above, left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out
    movq        mm1,        [GLOBAL(dc_128)]
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax,        [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_tm_sse2 / vp8_intra_pred_uv_tm_ssse3(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 "TM" prediction. For each output byte:
;   dst[y][x] = saturate_u8(left[y] + above[x] - above[-1])
; The saturation comes from packuswb. Two rows are produced per loop
; iteration (4 iterations).
; The macro parameter selects how a byte is broadcast to 8 words:
;   sse2  - punpcklbw / pshuflw / punpcklqdq
;   ssse3 - a single pshufb with the dc_1024 control mask
; Note: the left and top-left bytes are fetched with 4-byte movd loads;
; only the lowest byte of each load is used.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; read top row
    mov         edx,        4                   ; loop counter: 4 iterations x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm2,       [GLOBAL(dc_1024)]
%endif
    movq        xmm1,       [rsi]
    punpcklbw   xmm1,       xmm0                ; xmm1 = above[0..7] as words

    ; set up left ptrs and subtract topleft
    movd        xmm3,       [rsi-1]             ; low byte = above[-1]
    mov         rsi,        arg(3) ;left;
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    punpcklqdq  xmm3,       xmm3
%else
    pshufb      xmm3,       xmm2
%endif
    psubw       xmm1,       xmm3                ; xmm1 = above[x] - topleft

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_uv_tm_%1_loop:
    movd        xmm3,       [rsi]               ; low byte = left[y]
    movd        xmm5,       [rsi+rax]           ; low byte = left[y+1]
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm3,       xmm3
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm3,       xmm2
    pshufb      xmm5,       xmm2
%endif
    paddw       xmm3,       xmm1
    paddw       xmm5,       xmm1
    packuswb    xmm3,       xmm5                ; low 8 bytes = row y, high 8 = row y+1
    movq        [rdi    ],  xmm3
    movhps      [rdi+rcx],  xmm3
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_ve_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 vertical prediction: the 8 bytes at above are copied into each
; of the 8 output rows. left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; arg(3), arg(4) not used

    ; read from top
    mov         rax,        arg(2) ;src;

    movq        mm1,        [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax,        [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_ho_mmx2 / vp8_intra_pred_uv_ho_ssse3(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 horizontal prediction: every byte of output row y is left[y].
; above is ignored.
;   mmx2  - loop of 4 iterations, two rows per iteration.
;   ssse3 - unrolled, four rows per step, using pshufb with dc_00001111.
;           This variant uses rbx as scratch (3 * left_stride); rbx is
;           pushed by hand when GET_GOT_SAVE_ARG is not defined,
;           presumably because GET_GOT then does not save it itself.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
    push        rbx
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx,        4                   ; loop counter: 4 iterations x 2 rows
%endif
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
%ifidn %1, ssse3
    lea         rdx,        [rcx*3]             ; rdx = 3 * dst_stride
    movdqa      xmm2,       [GLOBAL(dc_00001111)]
    ; rbx is only overwritten after the last GLOBAL() reference above
    lea         rbx,        [rax*3]             ; rbx = 3 * left_stride
%endif

%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
    movd        mm0,        [rsi]
    movd        mm1,        [rsi+rax]
    punpcklbw   mm0,        mm0
    punpcklbw   mm1,        mm1
    pshufw      mm0,        mm0, 0x0            ; broadcast left[y] to 8 bytes
    pshufw      mm1,        mm1, 0x0            ; broadcast left[y+1] to 8 bytes
    movq        [rdi    ],  mm0
    movq        [rdi+rcx],  mm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_ho_%1_loop
%else
    ; rows 0..3
    movd        xmm0,       [rsi]
    movd        xmm3,       [rsi+rax]
    movd        xmm1,       [rsi+rax*2]
    movd        xmm4,       [rsi+rbx]
    punpcklbw   xmm0,       xmm3                ; byte 0 = left[0], byte 1 = left[1]
    punpcklbw   xmm1,       xmm4                ; byte 0 = left[2], byte 1 = left[3]
    pshufb      xmm0,       xmm2                ; low qword = byte 0 x8, high = byte 1 x8
    pshufb      xmm1,       xmm2
    movq        [rdi    ],  xmm0
    movhps      [rdi+rcx],  xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx],  xmm1
    lea         rsi,        [rsi+rax*4]
    lea         rdi,        [rdi+rcx*4]
    ; rows 4..7
    movd        xmm0,       [rsi]
    movd        xmm3,       [rsi+rax]
    movd        xmm1,       [rsi+rax*2]
    movd        xmm4,       [rsi+rbx]
    punpcklbw   xmm0,       xmm3
    punpcklbw   xmm1,       xmm4
    pshufb      xmm0,       xmm2
    pshufb      xmm1,       xmm2
    movq        [rdi    ],  xmm0
    movhps      [rdi+rcx],  xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx],  xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dc_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from both edges:
;   dc = (sum of 16 bytes at above + sum of 16 bytes down the left column + 16) / 32
; above is read with movdqa and dst is written with movdqa, so both
; (and dst_stride) must keep accesses 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rdi]
    psadbw      xmm1,       xmm0                ; one partial sum per 8-byte half
    movq        xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; word 0 = sum of all 16 above bytes

    ; from left
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride

    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 4..7

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 8..11

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 12..15

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx                 ; ecx = sum of the 16 left bytes

    ; add up
    pextrw      edx,        xmm1, 0x0
    lea         edx,        [edx+ecx+16]        ; + 16 rounds the divide by 32
    sar         edx,        5
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1                ; dc broadcast to 16 bytes

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]

.store_8_rows:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz         .store_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dctop_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from the top edge only:
;   dc = (sum of 16 bytes at above + 8) / 16
; left and left_stride are ignored. above and dst are accessed with
; movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top
    mov         rcx,        arg(2) ;above;
    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rcx]
    psadbw      xmm1,       xmm0                ; one partial sum per 8-byte half
    movdqa      xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; word 0 = sum of all 16 above bytes

    ; add up
    paddw       xmm1,       [GLOBAL(dc_8)]      ; rounding term
    psraw       xmm1,       4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdx,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]

.store_8_rows:
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx,        [rdx+rcx*4]
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx,        [rdx+rcx*4]
    dec         rsi
    jnz         .store_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dcleft_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from the left edge only:
;   dc = (sum of 16 bytes down the left column + 8) / 16
; above is ignored. dst is written with movdqa and must be 16-byte
; aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 4..7
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 8..11
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; left rows 12..15
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    lea         edx,        [ecx+edx+8]         ; final add fused with the rounding term

    ; add up
    shr         edx,        4
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]

.store_8_rows:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz         .store_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dc128_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills the 16x16 block at dst with the constant byte 128 (from dc_128).
; above, left and left_stride are ignored. dst is written with movdqa
; and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    movdqa      xmm1,       [GLOBAL(dc_128)]
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]

.store_8_rows:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax,        [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz         .store_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_tm_sse2 / vp8_intra_pred_y_tm_ssse3(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 "TM" prediction. For each output byte:
;   dst[y][x] = saturate_u8(left[y] + above[x] - above[-1])
; Two rows are produced per loop iteration (8 iterations).
; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
; above and dst are accessed with movdqa and must be 16-byte aligned.
; The loop label is a local label so that it does not add a file-scope
; symbol, matching the uv_tm variant.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    GET_GOT     rbx
    ; end prolog

    ; read top row
    mov         edx,        8                   ; loop counter: 8 iterations x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm3,       [GLOBAL(dc_1024)]
%endif
    movdqa      xmm1,       [rsi]
    movdqa      xmm2,       xmm1
    punpcklbw   xmm1,       xmm0                ; xmm1 = above[0..7]  as words
    punpckhbw   xmm2,       xmm0                ; xmm2 = above[8..15] as words

    ; set up left ptrs and subtract topleft
    movd        xmm4,       [rsi-1]             ; low byte = above[-1]
    mov         rsi,        arg(3) ;left
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    punpcklqdq  xmm4,       xmm4
%else
    pshufb      xmm4,       xmm3
%endif
    psubw       xmm1,       xmm4                ; above[x] - topleft, low half
    psubw       xmm2,       xmm4                ; above[x] - topleft, high half

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
.vp8_intra_pred_y_tm_%1_loop:
    movd        xmm4,       [rsi]               ; low byte = left[y]
    movd        xmm5,       [rsi+rax]           ; low byte = left[y+1]
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm4,       xmm4
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm4,       xmm3
    pshufb      xmm5,       xmm3
%endif
    movdqa      xmm6,       xmm4
    movdqa      xmm7,       xmm5
    paddw       xmm4,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm5,       xmm1
    paddw       xmm7,       xmm2
    packuswb    xmm4,       xmm6                ; row y,   16 saturated bytes
    packuswb    xmm5,       xmm7                ; row y+1, 16 saturated bytes
    movdqa      [rdi    ],  xmm4
    movdqa      [rdi+rcx],  xmm5
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_ve_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 vertical prediction: the 16 bytes at above are copied into each
; of the 16 output rows. left and left_stride are ignored. above and
; dst are accessed with movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ;arg(3), arg(4) not used

    mov         rax,        arg(2) ;above;
    mov         rsi,        2                   ; 2 iterations x 8 rows
    movsxd      rdx,        dword ptr arg(1) ;dst_stride

    ; read from top
    movdqa      xmm1,       [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    lea         rcx,        [rdx*3]

.store_8_rows:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax,        [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz         .store_8_rows

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_ho_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 horizontal prediction: every byte of output row y is left[y].
; above is ignored. Two rows are produced per loop iteration
; (8 iterations). dst is written with movdqa and must be 16-byte
; aligned. The loop label is a local label so that it does not add a
; file-scope symbol.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
    mov         edx,        8                   ; loop counter: 8 iterations x 2 rows
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_y_ho_sse2_loop:
    movd        xmm0,       [rsi]               ; low byte = left[y]
    movd        xmm1,       [rsi+rax]           ; low byte = left[y+1]
    ; FIXME use pshufb for ssse3 version
    punpcklbw   xmm0,       xmm0                ; duplicate the byte into a word
    punpcklbw   xmm1,       xmm1
    pshuflw     xmm0,       xmm0, 0x0           ; word 0 -> low 4 words
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm0,       xmm0                ; low qword -> both qwords
    punpcklqdq  xmm1,       xmm1
    movdqa      [rdi    ],  xmm0
    movdqa      [rdi+rcx],  xmm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_ho_sse2_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; dc_128: fill value for the dc128 predictors (16 bytes of 128).
align 16
dc_128:
    times 16 db 128
; dc_4: rounding term for the 8-sample DC average (read with an MMX paddw).
dc_4:
    times 4 dw 4
; dc_8: rounding term for the 16-sample DC average.
align 16
dc_8:
    times 8 dw 8
; dc_1024: pshufb control; each word 0x0400 selects source byte 0 then
; source byte 4, turning a movd-loaded byte into 8 zero-extended words.
align 16
dc_1024:
    times 8 dw 0x400
; dc_00001111: pshufb control; low 8 bytes replicate source byte 0,
; high 8 bytes replicate source byte 1.
align 16
dc_00001111:
    times 8 db 0
    times 8 db 1